From: stevenknight Date: Sun, 25 Sep 2005 18:12:01 +0000 (+0000) Subject: Optimize is_List et al. Add a script harness and scripts for benchmarking Python... X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=a398c050f75a1e8ffc0808284b388b618e5e50e2;p=scons.git Optimize is_List et al. Add a script harness and scripts for benchmarking Python code snippets. git-svn-id: http://scons.tigris.org/svn/scons/trunk@1350 fdb21ef1-2011-0410-befe-b5e4ea1792b1 --- diff --git a/README b/README index f34c9c29..e178bc29 100644 --- a/README +++ b/README @@ -316,6 +316,13 @@ CONTENTS OF THIS PACKAGE Not guaranteed to be up-to-date (but better than nothing): +bench/ + A subdirectory for benchmarking scripts, used to perform timing + tests to decide what specific idioms are most efficient for + various parts of the code base. We check these in so they're + available in case we have to revisit any of these decisions in + the future. + bin/ Miscellaneous utilities used in SCons development. Right now, some of the stuff here includes: diff --git a/bench/README.txt b/bench/README.txt new file mode 100644 index 00000000..03931076 --- /dev/null +++ b/bench/README.txt @@ -0,0 +1,40 @@ +# __COPYRIGHT__ + +This subdirectory contains a harness and various timing tests that we've +used to decide on the most efficient implementation of various pieces +of the code base. We're checking these in here so that they're always +available in case we have to revisit these decisions. + +NOTE: This harness is for horse-racing specific snippets of Python +code to select the best implementation to use within a given function +or subsystem. It's not intended for end-to-end testing of SCons itself. + +Contents of the directory: + + README.txt + + What you're reading right now. + + bench.py + + The harness for running the timing tests that make up + the rest of the directory's contents. Use it to run + one of the timing tests as follows: + + python bench.py FILE + + Various command-line options control the number of runs, the + number of iterations on each run, etc. Help for the command-line + options is available: + + python bench.py -h + + is_types.py + lvars-gvars.py + [etc.] + + The rest of the files in this directory should each contain a + specific timing test, consisting of various functions to be run + against each other, and test data to be passed to the functions. + + Yes, this list of files will get out of date. diff --git a/bench/bench.py b/bench/bench.py new file mode 100644 index 00000000..3c5dd50b --- /dev/null +++ b/bench/bench.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# +# __COPYRIGHT__ +# +# A script for timing snippets of Python code. +# +# By default, this script will execute a single Python file specified on +# the command line and time any functions in a list named "FunctionList" +# set by the Python file under test, or (by default) time any functions +# in the file whose names begin with "Func". +# +# All functions are assumed to get passed the same arguments, and the +# inputs are specified in a list named "Data," each element of which +# is a list consisting of a tag name, a list of positional arguments, +# and a dictionary of keyword arguments. +# +# Each function is expected to test a single, comparable snippet of +# of Python code. IMPORTANT: We want to test the timing of the code +# itself, not Python function call overhead, so every function should +# put its code under test within the following block: +# +# for i in IterationList: +# +# This will allow (as much as possible) us to time just the code itself, +# not Python function call overhead. + +import getopt +import sys +import time +import types + +Usage = """\ +Usage: bench.py OPTIONS file.py + --clock Use the time.clock function + --func PREFIX Test functions whose names begin with PREFIX + -h, --help Display this help and exit + -i ITER, --iterations ITER Run each code snippet ITER times + --time Use the time.time function + -r RUNS, --runs RUNS Average times for RUNS invocations of +""" + +# How many times each snippet of code will be (or should be) run by the +# functions under test to gather the time (the "inner loop"). + +Iterations = 1000 + +# How many times we'll run each function to collect its aggregate time +# and try to average out timing differences induced by system performance +# (the "outer loop"). + +Runs = 10 + +# The prefix of the functions under test. This will be used if +# there's no explicit list defined in FunctionList. + +FunctionPrefix = 'Func' + +# The function used to get the current time. The default of time.time is +# good on most UNIX systems, but time.clock (selectable via the --clock +# option) is better on Windows and some other UNIX systems. + +Now = time.time + + +opts, args = getopt.getopt(sys.argv[1:], 'hi:r:', + ['clock', 'func=', 'help', + 'iterations=', 'time', 'runs=']) + +for o, a in opts: + if o in ['--clock']: + Now = time.clock + elif o in ['--func']: + FunctionPrefix = a + elif o in ['-h', '--help']: + sys.stdout.write(Usage) + sys.exit(0) + elif o in ['-i', '--iterations']: + Iterations = int(a) + elif o in ['--time']: + Now = time.time + elif o in ['-r', '--runs']: + Runs = int(a) + +if len(args) != 1: + sys.stderr.write("bench.py: only one file argument must be specified\n") + sys.stderr.write(Usage) + sys.exit(1) + + +execfile(args[0]) + + +try: + FunctionList +except NameError: + function_names = filter(lambda x: x[:4] == FunctionPrefix, locals().keys()) + function_names.sort() + l = map(lambda f, l=locals(): l[f], function_names) + FunctionList = filter(lambda f: type(f) == types.FunctionType, l) + +IterationList = [None] * Iterations + +def timer(func, *args, **kw): + results = [] + for i in range(Runs): + start = Now() + apply(func, args, kw) + finish = Now() + results.append((finish - start) / Iterations) + return results + +def display(label, results): + total = reduce(lambda x, y: x+y, results, 0.0) + print " %8.3f" % ((total * 1e6) / len(results)), ':', label + +for func in FunctionList: + if func.__doc__: d = ' (' + func.__doc__ + ')' + else: d = '' + print func.__name__ + d + ':' + + for label, args, kw in Data: + r = apply(timer, (func,)+args, kw) + display(label, r) diff --git a/bench/is_types.py b/bench/is_types.py new file mode 100644 index 00000000..1f4805b9 --- /dev/null +++ b/bench/is_types.py @@ -0,0 +1,331 @@ +# __COPYRIGHT__ +# +# Benchmarks for testing various possible implementations +# of the is_Dict(), is_List() and is_String() functions in +# src/engine/SCons/Util.py. + +import types +from UserDict import UserDict +from UserList import UserList + +try: + from UserString import UserString +except ImportError: + # "Borrowed" from the Python 2.2 UserString module + # and modified slightly for use with SCons. + class UserString: + def __init__(self, seq): + if type(seq) == type(''): + self.data = seq + elif isinstance(seq, UserString): + self.data = seq.data[:] + else: + self.data = str(seq) + def __str__(self): return str(self.data) + def __repr__(self): return repr(self.data) + def __int__(self): return int(self.data) + def __long__(self): return long(self.data) + def __float__(self): return float(self.data) + def __complex__(self): return complex(self.data) + def __hash__(self): return hash(self.data) + + def __cmp__(self, string): + if isinstance(string, UserString): + return cmp(self.data, string.data) + else: + return cmp(self.data, string) + def __contains__(self, char): + return char in self.data + + def __len__(self): return len(self.data) + def __getitem__(self, index): return self.__class__(self.data[index]) + def __getslice__(self, start, end): + start = max(start, 0); end = max(end, 0) + return self.__class__(self.data[start:end]) + + def __add__(self, other): + if isinstance(other, UserString): + return self.__class__(self.data + other.data) + elif is_String(other): + return self.__class__(self.data + other) + else: + return self.__class__(self.data + str(other)) + def __radd__(self, other): + if is_String(other): + return self.__class__(other + self.data) + else: + return self.__class__(str(other) + self.data) + def __mul__(self, n): + return self.__class__(self.data*n) + __rmul__ = __mul__ + +InstanceType = types.InstanceType +DictType = types.DictType +ListType = types.ListType +StringType = types.StringType +if hasattr(types, 'UnicodeType'): + UnicodeType = types.UnicodeType + + +# The original implementations, pretty straightforward checks for the +# type of the object and whether it's an instance of the corresponding +# User* type. + +def original_is_Dict(e): + return type(e) is types.DictType or isinstance(e, UserDict) + +def original_is_List(e): + return type(e) is types.ListType or isinstance(e, UserList) + +if hasattr(types, 'UnicodeType'): + def original_is_String(e): + return type(e) is types.StringType \ + or type(e) is types.UnicodeType \ + or isinstance(e, UserString) +else: + def original_is_String(e): + return type(e) is types.StringType or isinstance(e, UserString) + + + +# New candidates that explicitly check for whether the object is an +# InstanceType before calling isinstance() on the corresponding User* +# type. + +def checkInstanceType_is_Dict(e): + return type(e) is types.DictType or \ + (type(e) is types.InstanceType and isinstance(e, UserDict)) + +def checkInstanceType_is_List(e): + return type(e) is types.ListType \ + or (type(e) is types.InstanceType and isinstance(e, UserList)) + +if hasattr(types, 'UnicodeType'): + def checkInstanceType_is_String(e): + return type(e) is types.StringType \ + or type(e) is types.UnicodeType \ + or (type(e) is types.InstanceType and isinstance(e, UserString)) +else: + def checkInstanceType_is_String(e): + return type(e) is types.StringType \ + or (type(e) is types.InstanceType and isinstance(e, UserString)) + + + +# Improved candidates that cache the type(e) result in a variable +# before doing any checks. + +def cache_type_e_is_Dict(e): + t = type(e) + return t is types.DictType or \ + (t is types.InstanceType and isinstance(e, UserDict)) + +def cache_type_e_is_List(e): + t = type(e) + return t is types.ListType \ + or (t is types.InstanceType and isinstance(e, UserList)) + +if hasattr(types, 'UnicodeType'): + def cache_type_e_is_String(e): + t = type(e) + return t is types.StringType \ + or t is types.UnicodeType \ + or (t is types.InstanceType and isinstance(e, UserString)) +else: + def cache_type_e_is_String(e): + t = type(e) + return t is types.StringType \ + or (t is types.InstanceType and isinstance(e, UserString)) + + + +# Improved candidates that cache the type(e) result in a variable +# before doing any checks, but using the global names for +# DictType, ListType and StringType. + +def global_cache_type_e_is_Dict(e): + t = type(e) + return t is DictType or \ + (t is InstanceType and isinstance(e, UserDict)) + +def global_cache_type_e_is_List(e): + t = type(e) + return t is ListType \ + or (t is InstanceType and isinstance(e, UserList)) + +if hasattr(types, 'UnicodeType'): + def global_cache_type_e_is_String(e): + t = type(e) + return t is StringType \ + or t is UnicodeType \ + or (t is InstanceType and isinstance(e, UserString)) +else: + def global_cache_type_e_is_String(e): + t = type(e) + return t is StringType \ + or (t is InstanceType and isinstance(e, UserString)) + + + +# Alternative that uses a myType() function to map the User* objects +# to their corresponding underlying types. + +instanceTypeMap = { + UserDict : types.DictType, + UserList : types.ListType, + UserString : types.StringType, +} + +if hasattr(types, 'UnicodeType'): + def myType(obj): + t = type(obj) + if t is types.InstanceType: + t = instanceTypeMap.get(obj.__class__, t) + elif t is types.UnicodeType: + t = types.StringType + return t +else: + def myType(obj): + t = type(obj) + if t is types.InstanceType: + t = instanceTypeMap.get(obj.__class__, t) + return t + +def myType_is_Dict(e): + return myType(e) is types.DictType + +def myType_is_List(e): + return myType(e) is types.ListType + +def myType_is_String(e): + return myType(e) is types.StringType + + + + +def Func01(obj): + """original_is_String""" + for i in IterationList: + original_is_String(obj) + +def Func02(obj): + """original_is_List""" + for i in IterationList: + original_is_List(obj) + +def Func03(obj): + """original_is_Dict""" + for i in IterationList: + original_is_Dict(obj) + +def Func04(obj): + """checkInstanceType_is_String""" + for i in IterationList: + checkInstanceType_is_String(obj) + +def Func05(obj): + """checkInstanceType_is_List""" + for i in IterationList: + checkInstanceType_is_List(obj) + +def Func06(obj): + """checkInstanceType_is_Dict""" + for i in IterationList: + checkInstanceType_is_Dict(obj) + +def Func07(obj): + """cache_type_e_is_String""" + for i in IterationList: + cache_type_e_is_String(obj) + +def Func08(obj): + """cache_type_e_is_List""" + for i in IterationList: + cache_type_e_is_List(obj) + +def Func09(obj): + """cache_type_e_is_Dict""" + for i in IterationList: + cache_type_e_is_Dict(obj) + +def Func10(obj): + """global_cache_type_e_is_String""" + for i in IterationList: + global_cache_type_e_is_String(obj) + +def Func11(obj): + """global_cache_type_e_is_List""" + for i in IterationList: + global_cache_type_e_is_List(obj) + +def Func12(obj): + """global_cache_type_e_is_Dict""" + for i in IterationList: + global_cache_type_e_is_Dict(obj) + +#def Func13(obj): +# """myType_is_String""" +# for i in IterationList: +# myType_is_String(obj) +# +#def Func14(obj): +# """myType_is_List""" +# for i in IterationList: +# myType_is_List(obj) +# +#def Func15(obj): +# """myType_is_Dict""" +# for i in IterationList: +# myType_is_Dict(obj) + + + +# Data to pass to the functions on each run. Each entry is a +# three-element tuple: +# +# ( +# "Label to print describing this data run", +# ('positional', 'arguments'), +# {'keyword' : 'arguments'}, +# ), + +class A: + pass + +Data = [ + ( + "String", + ('',), + {}, + ), + ( + "List", + ([],), + {}, + ), + ( + "Dict", + ({},), + {}, + ), + ( + "UserString", + (UserString(''),), + {}, + ), + ( + "UserList", + (UserList([]),), + {}, + ), + ( + "UserDict", + (UserDict({}),), + {}, + ), + ( + "Object", + (A(),), + {}, + ), +] diff --git a/bench/lvars-gvars.py b/bench/lvars-gvars.py new file mode 100644 index 00000000..efcef2ad --- /dev/null +++ b/bench/lvars-gvars.py @@ -0,0 +1,74 @@ +# __COPYRIGHT__ +# +# Functions and data for timing different idioms for fetching a keyword +# value from a pair of dictionaries for localand global values. This was +# used to select how to most efficiently expand single $KEYWORD strings +# in src/engine/SCons/Subst.py. + +def Func1(var, gvars, lvars): + """lvars try:-except:, gvars try:-except:""" + for i in IterationList: + try: + x = lvars[var] + except KeyError: + try: + x = gvars[var] + except KeyError: + x = '' + +def Func2(var, gvars, lvars): + """lvars has_key(), gvars try:-except:""" + for i in IterationList: + if lvars.has_key(var): + x = lvars[var] + else: + try: + x = gvars[var] + except KeyError: + x = '' + +def Func3(var, gvars, lvars): + """lvars has_key(), gvars has_key()""" + for i in IterationList: + if lvars.has_key(var): + x = lvars[var] + elif gvars.has_key(var): + x = gvars[var] + else: + x = '' + +def Func4(var, gvars, lvars): + """eval()""" + for i in IterationList: + try: + x = eval(var, gvars, lvars) + except NameError: + x = '' + +# Data to pass to the functions on each run. Each entry is a +# three-element tuple: +# +# ( +# "Label to print describing this data run", +# ('positional', 'arguments'), +# {'keyword' : 'arguments'}, +# ), + +Data = [ + ( + "Neither in gvars or lvars", + ('x', {}, {}), + {}, + ), + ( + "Missing from lvars, found in gvars", + ('x', {'x':1}, {}), + {}, + ), + ( + "Found in lvars", + ('x', {'x':1}, {'x':2}), + {}, + ), +] + diff --git a/bin/bench.py b/bin/bench.py deleted file mode 100644 index 7d9e2ba0..00000000 --- a/bin/bench.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python -# -# A script for timing snippets of Python code. - -import time - -Now = time.time -#Now = time.clock # better on Windows, some UNIX/Linux systems - -# How many times each snippet of code will be run by the functions -# to gather the time (the "inner loop"). - -Iterations = 10000 - -# How many times we'll run each function to collect its aggregate time -# and try to average out timing differences induced by system performance -# (the "outer loop"). - -Runs = 10 - -# The functions containing the code snippets to test and compare. -# This script will test any function whose name begins with the string -# "Func" and assumes that they all get passed the same arguments. -# Each function should put its snippet within a block: -# -# for i in IterationList: -# -# So that (as much as possible) we're testing just the code itself, -# not Python function call overhead. - -def Func1(var, gvars, lvars): - for i in IterationList: - try: - x = lvars[var] - except KeyError: - try: - x = gvars[var] - except KeyError: - x = '' - -def Func2(var, gvars, lvars): - for i in IterationList: - if lvars.has_key(var): - x = lvars[var] - else: - try: - x = gvars[var] - except KeyError: - x = '' - -def Func3(var, gvars, lvars): - for i in IterationList: - if lvars.has_key(var): - x = lvars[var] - elif gvars.has_key(var): - x = gvars[var] - else: - x = '' - -def Func4(var, gvars, lvars): - for i in IterationList: - try: - x = eval(var, gvars, lvars) - except NameError: - x = '' - -# Data to pass to the functions on each run. Each entry is a -# three-element tuple: -# -# ( -# "Label to print describing this data run", -# ('positional', 'arguments'), -# {'keyword' : 'arguments'}, -# ), - -Data = [ - ( - "Neither in gvars or lvars", - ('x', {}, {}), - {}, - ), - ( - "Missing from lvars, found in gvars", - ('x', {'x':1}, {}), - {}, - ), - ( - "Found in lvars", - ('x', {'x':1}, {'x':2}), - {}, - ), -] - - - -IterationList = [None] - -def timer(func, *args, **kw): - global IterationList - IterationList = [None] * Iterations - results = [] - for i in range(Runs): - start = Now() - func(*args, **kw) - finish = Now() - results.append((finish - start) / Iterations) - return results - -def display(label, results): - print ' ' + label + ':' - print ' ', - for r in results: - print "%.3f" % (r * 1e6), - print - -def display(label, results): - total = reduce(lambda x, y: x+y, results, 0.0) - print " %8.3f" % ((total * 1e6) / len(results)), ':', label - -func_names = filter(lambda x: x.startswith('Func'), locals().keys()) -func_names.sort() - -for f in func_names: - func = locals()[f] - print f + ':' - - for label, args, kw in Data: - r = apply(timer, (func,)+args, kw) - display(label, r) diff --git a/src/engine/SCons/Util.py b/src/engine/SCons/Util.py index 958b8c7f..b7fcec4c 100644 --- a/src/engine/SCons/Util.py +++ b/src/engine/SCons/Util.py @@ -42,6 +42,13 @@ import types from UserDict import UserDict from UserList import UserList +# Don't "from types import ..." these because we need to get at the +# types module later to look for UnicodeType. +DictType = types.DictType +InstanceType = types.InstanceType +ListType = types.ListType +StringType = types.StringType + try: from UserString import UserString except ImportError: @@ -164,12 +171,13 @@ def updrive(path): # specified object has one. # if hasattr(types, 'UnicodeType'): + UnicodeType = types.UnicodeType def to_String(s): if isinstance(s, UserString): t = type(s.data) else: t = type(s) - if t is types.UnicodeType: + if t is UnicodeType: return unicode(s) else: return str(s) @@ -378,20 +386,55 @@ def print_tree(root, child_func, prune=0, showtags=0, margin=[0], visited={}): print_tree(children[-1], child_func, prune, IDX(showtags), margin, visited) margin.pop() -def is_Dict(e): - return type(e) is types.DictType or isinstance(e, UserDict) -def is_List(e): - return type(e) is types.ListType or isinstance(e, UserList) + +# Functions for deciding if things are like various types, mainly to +# handle UserDict, UserList and UserString like their underlying types. +# +# Yes, all of this manual testing breaks polymorphism, and the real +# Pythonic way to do all of this would be to just try it and handle the +# exception, but handling the exception when it's not the right type is +# too slow. +# +# The actual implementations here have been selected after timings +# coded up in in bench/is_types.py (from the SCons source tree, see the +# scons-src distribution). Key results from those timings: +# +# -- Storing the type of the object in a variable (t = type(obj)) +# slows down the case where it's a native type and the first +# comparison will match, but nicely speeds up the case where +# it's a different native type. Since that's going to be common, +# it's a good tradeoff. +# +# -- The data show that calling isinstance() on an object that's +# a native type (dict, list or string) is expensive enough that +# checking up front for whether the object is of type InstanceType +# is a pretty big win, even though it does slow down the case +# where it really *is* an object instance a little bit. + +def is_Dict(obj): + t = type(obj) + return t is DictType or \ + (t is InstanceType and isinstance(obj, UserDict)) + +def is_List(obj): + t = type(obj) + return t is ListType \ + or (t is InstanceType and isinstance(obj, UserList)) if hasattr(types, 'UnicodeType'): - def is_String(e): - return type(e) is types.StringType \ - or type(e) is types.UnicodeType \ - or isinstance(e, UserString) + def is_String(obj): + t = type(obj) + return t is StringType \ + or t is UnicodeType \ + or (t is InstanceType and isinstance(obj, UserString)) else: - def is_String(e): - return type(e) is types.StringType or isinstance(e, UserString) + def is_String(obj): + t = type(obj) + return t is StringType \ + or (t is InstanceType and isinstance(obj, UserString)) + + def is_Scalar(e): return is_String(e) or not is_List(e)