From 2ad8cfc559a3fdc024a6a782add80ccf94b1521b Mon Sep 17 00:00:00 2001 From: pankrat Date: Sun, 28 Sep 2008 13:04:03 +0000 Subject: [PATCH] Issue 1646: Block-wise signature computation for large files git-svn-id: http://scons.tigris.org/svn/scons/trunk@3514 fdb21ef1-2011-0410-befe-b5e4ea1792b1 --- doc/man/scons.1 | 13 +++ src/engine/SCons/Node/FS.py | 32 ++++++- src/engine/SCons/Script/Main.py | 3 + src/engine/SCons/Script/SConsOptions.py | 13 +++ src/engine/SCons/Util.py | 17 ++++ test/option/md5-chunksize.py | 121 ++++++++++++++++++++++++ 6 files changed, 195 insertions(+), 4 deletions(-) create mode 100644 test/option/md5-chunksize.py diff --git a/doc/man/scons.1 b/doc/man/scons.1 index 1e3e47bd..f080bd73 100644 --- a/doc/man/scons.1 +++ b/doc/man/scons.1 @@ -1070,6 +1070,19 @@ signature and to ignore the cached value if there already is one. A value of 0 means to always use the cached signature, no matter how old the file is. +.TP +.RI --md5-chunksize= KILOBYTES +Set the block size used to compute MD5 signatures to +.IR KILOBYTES . +This value determines the size of the chunks which are read in at once when +computing MD5 signatures. Files below that size are fully stored in memory +before performing the signature computation while bigger files are read in +block-by-block. A huge block-size leads to high memory consumption while a very +small block-size slows down the build considerably. + +The default value is to use a chunk size of 64 kilobytes, which should +be appropriate for most uses. + .TP -n, --just-print, --dry-run, --recon No execute. Print the commands that would be executed to build diff --git a/src/engine/SCons/Node/FS.py b/src/engine/SCons/Node/FS.py index 02dcdbfc..8301b152 100644 --- a/src/engine/SCons/Node/FS.py +++ b/src/engine/SCons/Node/FS.py @@ -2160,6 +2160,8 @@ class File(Base): NodeInfo = FileNodeInfo BuildInfo = FileBuildInfo + md5_chunksize = 64 + def diskcheck_match(self): diskcheck_match(self, self.isdir, "Directory %s found where file expected.") @@ -2233,6 +2235,23 @@ class File(Base): raise return r + def get_content_hash(self): + """ + Compute and return the MD5 hash for this file. + """ + if not self.rexists(): + return SCons.Util.MD5signature('') + fname = self.rfile().abspath + try: + cs = SCons.Util.MD5filesignature(fname, + chunksize=SCons.Node.FS.File.md5_chunksize*1024) + except EnvironmentError, e: + if not e.filename: + e.filename = fname + raise + return cs + + memoizer_counters.append(SCons.Memoize.CountValue('get_size')) def get_size(self): @@ -2697,7 +2716,10 @@ class File(Base): if csig is None: try: - contents = self.get_contents() + if self.get_size() < SCons.Node.FS.File.md5_chunksize: + contents = self.get_contents() + else: + csig = self.get_content_hash() except IOError: # This can happen if there's actually a directory on-disk, # which can be the case if they've disabled disk checks, @@ -2705,7 +2727,8 @@ class File(Base): # create a same-named directory by mistake. csig = '' else: - csig = SCons.Util.MD5signature(contents) + if not csig: + csig = SCons.Util.MD5signature(contents) ninfo.csig = csig @@ -2833,8 +2856,8 @@ class File(Base): cachedir, cachefile = self.get_build_env().get_CacheDir().cachepath(self) if not self.exists() and cachefile and os.path.exists(cachefile): - contents = open(cachefile, 'rb').read() - self.cachedir_csig = SCons.Util.MD5signature(contents) + self.cachedir_csig = SCons.Util.MD5filesignature(cachefile, \ + SCons.Node.FS.File.md5_chunksize * 1024) else: self.cachedir_csig = self.get_csig() return self.cachedir_csig @@ -2856,6 +2879,7 @@ class File(Base): self.cachesig = SCons.Util.MD5collect(sigs) return self.cachesig + default_fs = None def get_default_fs(): diff --git a/src/engine/SCons/Script/Main.py b/src/engine/SCons/Script/Main.py index 76b94e07..048dfdd1 100644 --- a/src/engine/SCons/Script/Main.py +++ b/src/engine/SCons/Script/Main.py @@ -966,6 +966,9 @@ def _main(parser): SCons.Job.explicit_stack_size = options.stack_size + if options.md5_chunksize: + SCons.Node.FS.File.md5_chunksize = options.md5_chunksize + platform = SCons.Platform.platform_module() if options.interactive: diff --git a/src/engine/SCons/Script/SConsOptions.py b/src/engine/SCons/Script/SConsOptions.py index 0e28fd29..d01ec043 100644 --- a/src/engine/SCons/Script/SConsOptions.py +++ b/src/engine/SCons/Script/SConsOptions.py @@ -126,6 +126,7 @@ class SConsValues(optparse.Values): 'help', 'implicit_cache', 'max_drift', + 'md5_chunksize', 'no_exec', 'num_jobs', 'random', @@ -177,6 +178,11 @@ class SConsValues(optparse.Values): value = int(value) except ValueError: raise SCons.Errors.UserError, "An integer is required: %s"%repr(value) + elif name == 'md5_chunksize': + try: + value = int(value) + except ValueError: + raise SCons.Errors.UserError, "An integer is required: %s"%repr(value) elif name == 'warn': if SCons.Util.is_String(value): value = [value] @@ -726,6 +732,13 @@ def Parser(version): help="Set maximum system clock drift to N seconds.", metavar="N") + op.add_option('--md5-chunksize', + nargs=1, type="int", + dest='md5_chunksize', default=SCons.Node.FS.File.md5_chunksize, + action="store", + help="Set chunk-size for MD5 signature computation to N kilobytes.", + metavar="N") + op.add_option('-n', '--no-exec', '--just-print', '--dry-run', '--recon', dest='no_exec', default=False, action="store_true", diff --git a/src/engine/SCons/Util.py b/src/engine/SCons/Util.py index 3fdc14c3..7ea36732 100644 --- a/src/engine/SCons/Util.py +++ b/src/engine/SCons/Util.py @@ -1496,6 +1496,12 @@ md5 = False def MD5signature(s): return str(s) +def MD5filesignature(fname, chunksize=65536): + f = open(fname, "rb") + result = f.read() + f.close() + return result + try: import hashlib except ImportError: @@ -1508,6 +1514,17 @@ else: m.update(str(s)) return m.hexdigest() + def MD5filesignature(fname, chunksize=65536): + m = hashlib.md5() + f = open(fname, "rb") + while 1: + blck = f.read(chunksize) + if not blck: + break + m.update(str(blck)) + f.close() + return m.hexdigest() + def MD5collect(signatures): """ Collects a list of signatures into an aggregate signature. diff --git a/test/option/md5-chunksize.py b/test/option/md5-chunksize.py new file mode 100644 index 00000000..8c294801 --- /dev/null +++ b/test/option/md5-chunksize.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# +# __COPYRIGHT__ +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# + +__revision__ = "__FILE__ __REVISION__ __DATE__ __DEVELOPER__" + +import string +import sys + +import TestSCons + +_python_ = TestSCons._python_ + +test = TestSCons.TestSCons() + +test.write('build.py', r""" +import sys +contents = open(sys.argv[2], 'rb').read() +file = open(sys.argv[1], 'wb') +file.write(contents) +file.close() +""") + +test.write('SConstruct', """ +SetOption('md5_chunksize', 128) +B = Builder(action = r'%(_python_)s build.py $TARGETS $SOURCES') +env = Environment(BUILDERS = { 'B' : B }) +f1 = env.B(target = 'f1.out', source = 'f1.in') +f2 = env.B(target = 'f2.out', source = 'f2.in') +Requires(f2, f1) +""" % locals()) + +test.write('f1.in', str(range(10))) +test.write('f2.in', str(range(100000))) + +expected_stdout = test.wrap_stdout("""\ +%(_python_)s build.py f1.out f1.in +%(_python_)s build.py f2.out f2.in +""" % locals()) + +# +# Test with SetOption('md5_chunksize') +# +test.run(arguments = '.', + stdout=expected_stdout, + stderr='') +test.must_exist('f1.out') +test.must_exist('f2.out') + +test.run(arguments = '-c .') +test.must_not_exist('f1.out') +test.must_not_exist('f2.out') + +# +# Test with --md5-chunksize +# +test.run(arguments = '--md5-chunksize=128 .', + stdout=expected_stdout, + stderr='') +test.must_exist('f1.out') +test.must_exist('f2.out') + +test.run(arguments = '--md5-chunksize=128 -c .') +test.must_not_exist('f1.out') +test.must_not_exist('f2.out') + +test.pass_test() + +# +# Big-file test +# +test2 = TestSCons.TestSCons() + +if string.find(sys.platform, 'linux') == -1: + test2.skip_test("skipping test on non-Linux platform '%s'\n" % sys.platform) + +dd = test2.where_is('dd') + +if not dd: + test2.skip_test('dd not found; skipping test\n') + +expected_stdout = test2.wrap_stdout("""\ +dd if=/dev/zero of=test.big seek=100 bs=1M count=0 2>/dev/null +get_stat(["test.stat"], ["test.big"]) +""") + +test2.write('SConstruct', """ +import os +def get_stat(target, source, env): + stat = os.stat(source[0].abspath) + dest = open(target[0].abspath,'w') + dest.write(str(stat)) + dest.close() +env = Environment() +env.Command('test.big', 'SConstruct', 'dd if=/dev/zero of=test.big seek=100 bs=1M count=0 2>/dev/null') +env.AlwaysBuild('test.big') +env.Command('test.stat', 'test.big', Action(get_stat)) +""") + +test2.run(arguments='--md5-chunksize=128', stdout=expected_stdout, stderr='') +test2.pass_test() -- 2.26.2