Issue 2255: Handle scanning of UTF-8 and UTF-16 files. (Greg Spencer)

[scons.git] / src / engine / SCons / Scanner / Fortran.py
diff --git a/src/engine/SCons/Scanner/Fortran.py b/src/engine/SCons/Scanner/Fortran.py

index 5d908f71e5bbbfc892ce061e645d81206533acfe..d2358ba7053b75b81166f85a7d159db35390b0fa 100644 (file)
--- a/src/engine/SCons/Scanner/Fortran.py
+++ b/src/engine/SCons/Scanner/Fortran.py
@@ -1,11 +1,11 @@
  """SCons.Scanner.Fortran
  
-This module implements the dependency scanner for Fortran code. 
+This module implements the dependency scanner for Fortran code.
  
  """
  
  #
-# Copyright (c) 2001, 2002 Steven Knight
+# __COPYRIGHT__
  #
  # Permission is hereby granted, free of charge, to any person obtaining
  # a copy of this software and associated documentation files (the
@@ -29,10 +29,8 @@ This module implements the dependency scanner for Fortran code.
  
  __revision__ = "__FILE__ __REVISION__ __DATE__ __DEVELOPER__"
  
-
-import copy
-import os.path
  import re
+import string
  
  import SCons.Node
  import SCons.Node.FS
@@ -40,93 +38,277 @@ import SCons.Scanner
  import SCons.Util
  import SCons.Warnings
  
-include_re = re.compile("INCLUDE[ \t]+'([\\w./\\\\]+)'", re.M)
-
-def FortranScan(fs = SCons.Node.FS.default_fs):
-    """Return a prototype Scanner instance for scanning source files
-    for Fortran INCLUDE statements"""
-    scanner = SCons.Scanner.Recursive(scan, "FortranScan", fs,
-                                      [".f", ".F", ".for", ".FOR"])
-    return scanner
+class F90Scanner(SCons.Scanner.Classic):
+    """
+    A Classic Scanner subclass for Fortran source files which takes
+    into account both USE and INCLUDE statements.  This scanner will
+    work for both F77 and F90 (and beyond) compilers.
  
-def scan(node, env, target, fs = SCons.Node.FS.default_fs):
+    Currently, this scanner assumes that the include files do not contain
+    USE statements.  To enable the ability to deal with USE statements
+    in include files, add logic right after the module names are found
+    to loop over each include file, search for and locate each USE
+    statement, and append each module name to the list of dependencies.
+    Caching the search results in a common dictionary somewhere so that
+    the same include file is not searched multiple times would be a
+    smart thing to do.
      """
-    scan(node, Environment) -> [node]
  
-    the Fortran dependency scanner function
+    def __init__(self, name, suffixes, path_variable,
+                 use_regex, incl_regex, def_regex, *args, **kw):
  
-    This function is intentionally simple. There are two rules it
-    follows:
-    
-    1) #include <foo.h> - search for foo.h in F77PATH followed by the
-        directory 'filename' is in
-    2) #include \"foo.h\" - search for foo.h in the directory 'filename' is
-       in followed by F77PATH
+        self.cre_use = re.compile(use_regex, re.M)
+        self.cre_incl = re.compile(incl_regex, re.M)
+        self.cre_def = re.compile(def_regex, re.M)
  
-    These rules approximate the behaviour of most C/C++ compilers.
+        def _scan(node, env, path, self=self):
+            node = node.rfile()
  
-    This scanner also ignores #ifdef and other preprocessor conditionals, so
-    it may find more depencies than there really are, but it never misses
-    dependencies.
-    """
+            if not node.exists():
+                return []
  
-    # This function caches various information in node and target:
-    # target.f77path - env['F77PATH'] converted to nodes
-    # node.found_includes - include files found by previous call to scan, 
-    #     keyed on f77path
-    # node.includes - the result of include_re.findall()
+            return self.scan(node, env, path)
  
-    if not hasattr(target, 'f77path'):
-        try:
-            target.f77path = tuple(fs.Rsearchall(SCons.Util.mapPaths(env['F77PATH'], target.cwd, env), clazz=SCons.Node.FS.Dir, must_exist=0))
-        except KeyError:
-            target.f77path = ()
+        kw['function'] = _scan
+        kw['path_function'] = SCons.Scanner.FindPathDirs(path_variable)
+        kw['recursive'] = 1
+        kw['skeys'] = suffixes
+        kw['name'] = name
  
-    f77path = target.f77path
+        apply(SCons.Scanner.Current.__init__, (self,) + args, kw)
  
-    nodes = []
+    def scan(self, node, env, path=()):
  
-    node = node.rfile()
-    try:
-        nodes = node.found_includes[f77path]
-    except KeyError:
-        if node.rexists():
+        # cache the includes list in node so we only scan it once:
+        if node.includes != None:
+            mods_and_includes = node.includes
+        else:
+            # retrieve all included filenames
+            includes = self.cre_incl.findall(node.get_text_contents())
+            # retrieve all USE'd module names
+            modules = self.cre_use.findall(node.get_text_contents())
+            # retrieve all defined module names
+            defmodules = self.cre_def.findall(node.get_text_contents())
  
-            # cache the includes list in node so we only scan it once:
-            if node.includes != None:
-                includes = node.includes
+            # Remove all USE'd module names that are defined in the same file
+            d = {}
+            for m in defmodules:
+                d[m] = 1
+            modules = filter(lambda m, d=d: not d.has_key(m), modules)
+            #modules = self.undefinedModules(modules, defmodules)
+
+            # Convert module name to a .mod filename
+            suffix = env.subst('$FORTRANMODSUFFIX')
+            modules = map(lambda x, s=suffix: string.lower(x) + s, modules)
+            # Remove duplicate items from the list
+            mods_and_includes = SCons.Util.unique(includes+modules)
+            node.includes = mods_and_includes
+
+        # This is a hand-coded DSU (decorate-sort-undecorate, or
+        # Schwartzian transform) pattern.  The sort key is the raw name
+        # of the file as specified on the USE or INCLUDE line, which lets
+        # us keep the sort order constant regardless of whether the file
+        # is actually found in a Repository or locally.
+        nodes = []
+        source_dir = node.get_dir()
+        if callable(path):
+            path = path()
+        for dep in mods_and_includes:
+            n, i = self.find_include(dep, source_dir, path)
+
+            if n is None:
+                SCons.Warnings.warn(SCons.Warnings.DependencyWarning,
+                                    "No dependency generated for file: %s (referenced by: %s) -- file not found" % (i, node))
              else:
-                includes = include_re.findall(node.get_contents())
-                node.includes = includes
-
-            source_dir = node.get_dir()
-            
-            for include in includes:
-                n = SCons.Node.FS.find_file(include,
-                                            (source_dir,) + f77path,
-                                            fs.File)
-                if not n is None:
-                    nodes.append(n)
-                else:
-                    SCons.Warnings.warn(SCons.Warnings.DependencyWarning,
-                                        "No dependency generated for file: %s (included from: %s) -- file not found" % (include, node))
-        node.found_includes[f77path] = nodes
-
-    # Schwartzian transform from the Python FAQ Wizard
-    def st(List, Metric):
-        def pairing(element, M = Metric):
-            return (M(element), element)
-        def stripit(pair):
-            return pair[1]
-        paired = map(pairing, List)
-        paired.sort()
-        return map(stripit, paired)
-
-    def normalize(node):
-        # We don't want the order of includes to be 
-        # modified by case changes on case insensitive OSes, so
-        # normalize the case of the filename here:
-        # (see test/win32pathmadness.py for a test of this)
-        return SCons.Node.FS._my_normcase(str(node))
-
-    return st(nodes, normalize)
+                sortkey = self.sort_key(dep)
+                nodes.append((sortkey, n))
+
+        nodes.sort()
+        nodes = map(lambda pair: pair[1], nodes)
+        return nodes
+
+def FortranScan(path_variable="FORTRANPATH"):
+    """Return a prototype Scanner instance for scanning source files
+    for Fortran USE & INCLUDE statements"""
+
+#   The USE statement regex matches the following:
+#
+#   USE module_name
+#   USE :: module_name
+#   USE, INTRINSIC :: module_name
+#   USE, NON_INTRINSIC :: module_name
+#
+#   Limitations
+#
+#   --  While the regex can handle multiple USE statements on one line,
+#       it cannot properly handle them if they are commented out.
+#       In either of the following cases:
+#
+#            !  USE mod_a ; USE mod_b         [entire line is commented out]
+#               USE mod_a ! ; USE mod_b       [in-line comment of second USE statement]
+#
+#       the second module name (mod_b) will be picked up as a dependency
+#       even though it should be ignored.  The only way I can see
+#       to rectify this would be to modify the scanner to eliminate
+#       the call to re.findall, read in the contents of the file,
+#       treating the comment character as an end-of-line character
+#       in addition to the normal linefeed, loop over each line,
+#       weeding out the comments, and looking for the USE statements.
+#       One advantage to this is that the regex passed to the scanner
+#       would no longer need to match a semicolon.
+#
+#   --  I question whether or not we need to detect dependencies to
+#       INTRINSIC modules because these are built-in to the compiler.
+#       If we consider them a dependency, will SCons look for them, not
+#       find them, and kill the build?  Or will there be standard
+#       compiler-specific directories we will need to point to so the
+#       compiler and SCons can locate the proper object and mod files?
+
+#   Here is a breakdown of the regex:
+#
+#   (?i)               : regex is case insensitive
+#   (?:                : group a collection of regex symbols without saving the match as a "group"
+#      ^|;             : matches either the start of a line or a semicolon
+#                        (a semicolon separates multiple statements on one line)
+#   )                  : end the unsaved grouping
+#   \s*                : any amount of white space
+#   USE                : match the string USE, case insensitive
+#   (?:                : group a collection of regex symbols without saving the match as a "group"
+#      \s+|            : match one or more whitespace OR ....  (the next entire grouped set of regex symbols)
+#      (?:             : group a collection of regex symbols without saving the match as a "group"
+#         (?:          : establish another unsaved grouping of regex symbols
+#            \s*          : any amount of white space
+#            ,         : match a comma
+#            \s*       : any amount of white space
+#            (?:NON_)? : optionally match the prefix NON_, case insensitive
+#            INTRINSIC : match the string INTRINSIC, case insensitive
+#         )?           : optionally match the ", INTRINSIC/NON_INTRINSIC" grouped expression
+#         \s*          : any amount of white space
+#         ::           : match a double colon that must appear after the INTRINSIC/NON_INTRINSIC attribute
+#      )               : end the unsaved grouping
+#   )                  : end the unsaved grouping
+#   \s*                : match any amount of white space
+#   (\w+)              : match the module name that is being USE'd
+#
+#
+    use_regex = "(?i)(?:^|;)\s*USE(?:\s+|(?:(?:\s*,\s*(?:NON_)?INTRINSIC)?\s*::))\s*(\w+)"
+
+
+#   The INCLUDE statement regex matches the following:
+#
+#   INCLUDE 'some_Text'
+#   INCLUDE "some_Text"
+#   INCLUDE "some_Text" ; INCLUDE "some_Text"
+#   INCLUDE kind_"some_Text"
+#   INCLUDE kind_'some_Text'
+#
+#   where some_Text can include any alphanumeric and/or special character
+#   as defined by the Fortran 2003 standard.
+#
+#   Limitations:
+#
+#   --  The Fortran standard dictates that a " or ' in the INCLUDE'd
+#       string must be represented as a "" or '', if the quotes that wrap
+#       the entire string are either a " or ', respectively.   While the
+#       regular expression below can detect the ' or " characters just fine,
+#       the scanning logic is presently unable to detect them and reduce
+#       them to a single instance.  This probably isn't an issue since,
+#       in practice, ' or " are not generally used in filenames.
+#
+#   --  This regex will not properly deal with multiple INCLUDE statements
+#       when the entire line has been commented out, ala
+#
+#           ! INCLUDE 'some_file' ; INCLUDE 'some_file'
+#
+#       In such cases, it will properly ignore the first INCLUDE file,
+#       but will actually still pick up the second.  Interestingly enough,
+#       the regex will properly deal with these cases:
+#
+#             INCLUDE 'some_file'
+#             INCLUDE 'some_file' !; INCLUDE 'some_file'
+#
+#       To get around the above limitation, the FORTRAN programmer could
+#       simply comment each INCLUDE statement separately, like this
+#
+#           ! INCLUDE 'some_file' !; INCLUDE 'some_file'
+#
+#       The way I see it, the only way to get around this limitation would
+#       be to modify the scanning logic to replace the calls to re.findall
+#       with a custom loop that processes each line separately, throwing
+#       away fully commented out lines before attempting to match against
+#       the INCLUDE syntax.
+#
+#   Here is a breakdown of the regex:
+#
+#   (?i)               : regex is case insensitive
+#   (?:                : begin a non-saving group that matches the following:
+#      ^               :    either the start of the line
+#      |               :                or
+#      ['">]\s*;       :    a semicolon that follows a single quote,
+#                           double quote or greater than symbol (with any
+#                           amount of whitespace in between).  This will
+#                           allow the regex to match multiple INCLUDE
+#                           statements per line (although it also requires
+#                           the positive lookahead assertion that is
+#                           used below).  It will even properly deal with
+#                           (i.e. ignore) cases in which the additional
+#                           INCLUDES are part of an in-line comment, ala
+#                                           "  INCLUDE 'someFile' ! ; INCLUDE 'someFile2' "
+#   )                  : end of non-saving group
+#   \s*                : any amount of white space
+#   INCLUDE            : match the string INCLUDE, case insensitive
+#   \s+                : match one or more white space characters
+#   (?:\w+_)?          : match the optional "kind-param _" prefix allowed by the standard
+#   [<"']              : match the include delimiter - an apostrophe, double quote, or less than symbol
+#   (.+?)              : match one or more characters that make up
+#                        the included path and file name and save it
+#                        in a group.  The Fortran standard allows for
+#                        any non-control character to be used.  The dot
+#                        operator will pick up any character, including
+#                        control codes, but I can't conceive of anyone
+#                        putting control codes in their file names.
+#                        The question mark indicates it is non-greedy so
+#                        that regex will match only up to the next quote,
+#                        double quote, or greater than symbol
+#   (?=["'>])          : positive lookahead assertion to match the include
+#                        delimiter - an apostrophe, double quote, or
+#                        greater than symbol.  This level of complexity
+#                        is required so that the include delimiter is
+#                        not consumed by the match, thus allowing the
+#                        sub-regex discussed above to uniquely match a
+#                        set of semicolon-separated INCLUDE statements
+#                        (as allowed by the F2003 standard)
+
+    include_regex = """(?i)(?:^|['">]\s*;)\s*INCLUDE\s+(?:\w+_)?[<"'](.+?)(?=["'>])"""
+
+#   The MODULE statement regex finds module definitions by matching
+#   the following:
+#
+#   MODULE module_name
+#
+#   but *not* the following:
+#
+#   MODULE PROCEDURE procedure_name
+#
+#   Here is a breakdown of the regex:
+#
+#   (?i)               : regex is case insensitive
+#   ^\s*               : start of line, followed by any amount of white space
+#   MODULE             : match the string MODULE, case insensitive
+#   \s+                : match one or more white space characters
+#   (?!PROCEDURE)      : but *don't* match if the next word matches
+#                        PROCEDURE (negative lookahead assertion),
+#                        case insensitive
+#   (\w+)              : match one or more alphanumeric characters
+#                        that make up the defined module name and
+#                        save it in a group
+
+    def_regex = """(?i)^\s*MODULE\s+(?!PROCEDURE)(\w+)"""
+
+    scanner = F90Scanner("FortranScan",
+                         "$FORTRANSUFFIXES",
+                         path_variable,
+                         use_regex,
+                         include_regex,
+                         def_regex)
+    return scanner