if java_parsing:
# Parse Java files for class names.
#
- # This is a really simple and cool parser from Charles Crain
+ # This is a really cool parser from Charles Crain
# that finds appropriate class names in Java source.
- _reToken = re.compile(r'[^\\]([\'"])|([\{\}])|' +
- r'(?:^|[\{\}\s;])((?:class|interface)'+
- r'\s+[A-Za-z_]\w*)|' +
- r'(new\s+[A-Za-z_]\w*\s*\([^\)]*\)\s*\{)|' +
- r'(//[^\r\n]*)|(/\*|\*/)')
+ # A regular expression that will find, in a Java file,
+ # any alphanumeric token (keyword, class name, specifier); open or
+ # close brackets; a single-line comment "//"; the multi-line comment
+ # begin and end tokens /* and */; single or double quotes; and
+ # single or double quotes preceded by a backslash.
+ # Alphanumeric tokens may contain dots, so a dotted name (such as a
+ # package name) is matched as a single token.
+ _reToken = re.compile(r'(//[^\r\n]*|\\[\'"]|[\'"\{\}]|[A-Za-z_][\w\.]*|' +
+ r'/\*|\*/)')
class OuterState:
+ """The initial state for parsing a Java file for classes,
+ interfaces, and anonymous inner classes."""
def __init__(self):
self.listClasses = []
self.listOutputs = []
self.stackBrackets = []
self.brackets = 0
self.nextAnon = 1
+ # Package name stored by setPackage(); stays None until a
+ # "package" keyword has been handled.
+ self.package = None
+
+ def __getClassState(self):
+ """Return the ClassState bound to this object, creating and
+ caching it in self.classState on first use."""
+ try:
+ return self.classState
+ except AttributeError:
+ ret = ClassState(self)
+ self.classState = ret
+ return ret
+
+ def __getPackageState(self):
+ """Return the PackageState bound to this object, creating and
+ caching it in self.packageState on first use."""
+ try:
+ return self.packageState
+ except AttributeError:
+ ret = PackageState(self)
+ self.packageState = ret
+ return ret
+
+ def __getAnonClassState(self):
+ """Return the cached state entered after a "new" keyword inside
+ a class: a SkipState that skips one token and then hands over
+ to an AnonClassState."""
+ try:
+ return self.anonState
+ except AttributeError:
+ ret = SkipState(1, AnonClassState(self))
+ self.anonState = ret
+ return ret
+
+ def __getSkipState(self):
+ """Return the cached SkipState that skips one token and then
+ returns to this object."""
+ try:
+ return self.skipState
+ except AttributeError:
+ ret = SkipState(1, self)
+ self.skipState = ret
+ return ret
def parseToken(self, token):
+ """Handle one token and return the state object whose
+ parseToken() should receive the next token."""
- #print token
if token[:2] == '//':
pass # ignore comment
elif token == '/*':
self.listOutputs.append(string.join(self.listClasses, '$'))
self.listClasses.pop()
self.stackBrackets.pop()
- elif token == '"':
- return IgnoreState('"', self)
- elif token == "'":
- return IgnoreState("'", self)
- elif token[:3] == "new":
+ elif token == '"' or token == "'":
+ return IgnoreState(token, self)
+ elif token == "new":
# anonymous inner class
if len(self.listClasses) > 0:
- clazz = self.listClasses[0]
- self.listOutputs.append('%s$%d' % (clazz, self.nextAnon))
- self.brackets = self.brackets + 1
- self.nextAnon = self.nextAnon + 1
- elif token[:5] == 'class':
+ return self.__getAnonClassState()
+ return self.__getSkipState() # Skip the class name
+ elif token == 'class' or token == 'interface':
if len(self.listClasses) == 0:
self.nextAnon = 1
- self.listClasses.append(string.join(string.split(token[6:])))
- self.stackBrackets.append(self.brackets)
- elif token[:9] == 'interface':
- if len(self.listClasses) == 0:
- self.nextAnon = 1
- self.listClasses.append(string.join(string.split(token[10:])))
self.stackBrackets.append(self.brackets)
+ return self.__getClassState()
+ elif token == 'package':
+ return self.__getPackageState()
return self
+ def addAnonClass(self):
+ """Add an anonymous inner class.
+
+ Appends '<listClasses[0]>$<nextAnon>' to listOutputs, then
+ increments both the bracket count and the anonymous class
+ counter."""
+ clazz = self.listClasses[0]
+ self.listOutputs.append('%s$%d' % (clazz, self.nextAnon))
+ # presumably accounts for the '{' that AnonClassState consumed
+ # before calling this method -- TODO confirm
+ self.brackets = self.brackets + 1
+ self.nextAnon = self.nextAnon + 1
+
+ def setPackage(self, package):
+ """Store the package name token passed in by PackageState."""
+ self.package = package
+
+ class AnonClassState:
+ """A state that looks for anonymous inner classes.
+
+ It inspects a single token: a '{' registers an anonymous class
+ on the outer state."""
+ def __init__(self, outer_state):
+ # outer_state is always an instance of OuterState
+ self.outer_state = outer_state
+ # NOTE(review): tokens_to_find is not read anywhere in the
+ # visible code -- confirm whether it is still needed.
+ self.tokens_to_find = 2
+ def parseToken(self, token):
+ # This is an anonymous class if and only if the next token is a bracket
+ # NOTE(review): a token other than '{' is not forwarded to
+ # outer_state.parseToken() here -- confirm that dropping it is
+ # intended.
+ if token == '{':
+ self.outer_state.addAnonClass()
+ return self.outer_state
+
+ class SkipState:
+ """A state that will skip a specified number of tokens before
+ reverting to the previous state.
+
+ The counter is never reset: once it has dropped below 1, a reused
+ instance returns old_state after consuming one more token."""
+ def __init__(self, tokens_to_skip, old_state):
+ # tokens_to_skip: how many tokens to consume before reverting
+ # old_state: the state object returned once skipping is done
+ self.tokens_to_skip = tokens_to_skip
+ self.old_state = old_state
+ def parseToken(self, token):
+ # The token itself is discarded; only the count matters.
+ self.tokens_to_skip = self.tokens_to_skip - 1
+ if self.tokens_to_skip < 1:
+ return self.old_state
+ return self
+
+ class ClassState:
+ """A state we go into when we hit a class or interface keyword.
+
+ The next token is recorded as the class name on the outer state."""
+ def __init__(self, outer_state):
+ # outer_state is always an instance of OuterState
+ self.outer_state = outer_state
+ def parseToken(self, token):
+ # the only token we get should be the name of the class.
+ # It is appended to the outer state's listClasses, and control
+ # returns to the outer state for the following token.
+ self.outer_state.listClasses.append(token)
+ return self.outer_state
+
class IgnoreState:
+ """A state that will ignore all tokens until it gets to a
+ specified token."""
def __init__(self, ignore_until, old_state):
+ # ignore_until: the token that ends the ignored region
+ # old_state: the state to resume once ignore_until is seen
self.ignore_until = ignore_until
self.old_state = old_state
def parseToken(self, token):
+ # The terminating token is consumed here; it is not forwarded
+ # to old_state.
- if token == self.ignore_until:
+ if self.ignore_until == token:
return self.old_state
return self
- def parse_java(file):
- contents = open(file, 'r').read()
-
- # Is there a more efficient way to do this than to split
- # the contents like this?
- pkg_dir = None
- for line in string.split(contents, "\n"):
- if line[:7] == 'package':
- pkg = string.split(line)[1]
- if pkg[-1] == ';':
- pkg = pkg[:-1]
- pkg_dir = apply(os.path.join, string.split(pkg, '.'))
- break
+ class PackageState:
+ """The state we enter when we encounter the package keyword.
+ We assume the next token will be the package name."""
+ def __init__(self, outer_state):
+ # outer_state is always an instance of OuterState
+ self.outer_state = outer_state
+ def parseToken(self, token):
+ # Store the token as the package name on the outer state and
+ # hand control back to it.
+ self.outer_state.setPackage(token)
+ return self.outer_state
+ def parse_java(fn):
+ """Parse a .java file and return a tuple of the package directory
+ (None when no package name was found) plus the list of class
+ names, nested names joined with '$', for which compiling that
+ .java file is expected to produce .class files.
+
+ fn is the path of the .java file to read."""
+ package = None
initial = OuterState()
currstate = initial
- for matches in _reToken.findall(contents):
+ # NOTE(review): the file object opened here is never explicitly
+ # closed.
+ for token in _reToken.findall(open(fn, 'r').read()):
# The regex produces a bunch of groups, but only one will
# have anything in it.
- token = filter(lambda x: x, matches)[0]
+ # NOTE(review): the new _reToken has a single capturing group, so
+ # findall() yields plain strings; the comment above describes the
+ # old multi-group pattern.
currstate = currstate.parseToken(token)
-
- return pkg_dir, initial.listOutputs
+ if initial.package:
+ # Turn the dotted package name into a relative directory path.
+ package = string.replace(initial.package, '.', os.sep)
+ return (package, initial.listOutputs)
else:
# Don't actually parse Java files for class names.