robustness against unicode errors on encoding detection
author Stefan Behnel <scoder@users.berlios.de>
Wed, 30 Apr 2008 21:42:09 +0000 (23:42 +0200)
committer Stefan Behnel <scoder@users.berlios.de>
Wed, 30 Apr 2008 21:42:09 +0000 (23:42 +0200)
Cython/Compiler/Main.py
Cython/Utils.py

index 22da73f43026dca1ac889853a24743373ec436a9..f07f528cb240395b11ff7088f27035474801b1b4 100644 (file)
@@ -139,25 +139,25 @@ class Context:
 
     def parse(self, source_filename, type_names, pxd, full_module_name):
         # Parse the given source file and return a parse tree.
-        f = Utils.open_source_file(source_filename, "rU")
-
-        if isinstance(source_filename, unicode):
-            name = source_filename
-        else:
-            filename_encoding = sys.getfilesystemencoding()
-            if filename_encoding is None:
-                filename_encoding = sys.getdefaultencoding()
-            name = source_filename.decode(filename_encoding)
-
         try:
+            f = Utils.open_source_file(source_filename, "rU")
+
             try:
-                s = PyrexScanner(f, name, source_encoding = f.encoding,
-                                 type_names = type_names, context = self)
-                tree = Parsing.p_module(s, pxd, full_module_name)
-            except UnicodeDecodeError, msg:
-                error((name, 0, 0), "Decoding error, missing or incorrect coding=<encoding-name> at top of source (%s)" % msg)
-        finally:
-            f.close()
+                if isinstance(source_filename, unicode):
+                    name = source_filename
+                else:
+                    filename_encoding = sys.getfilesystemencoding()
+                    if filename_encoding is None:
+                        filename_encoding = sys.getdefaultencoding()
+                    name = source_filename.decode(filename_encoding)
+
+                    s = PyrexScanner(f, name, source_encoding = f.encoding,
+                                     type_names = type_names, context = self)
+                    tree = Parsing.p_module(s, pxd, full_module_name)
+            finally:
+                f.close()
+        except UnicodeDecodeError, msg:
+            error((source_filename, 0, 0), "Decoding error, missing or incorrect coding=<encoding-name> at top of source (%s)" % msg)
         if Errors.num_errors > 0:
             raise CompileError
         return tree
index 342f09bfc7a901aee44c1430eea6b0264bc0280a..a6fd0577193889fc48bc1927673f58d2cd755f8e 100644 (file)
@@ -41,12 +41,15 @@ def detect_file_encoding(source_filename):
     # PEPs 263 and 3120
     f = codecs.open(source_filename, "rU", encoding="UTF-8")
     try:
-        for line_no, line in enumerate(f):
-            encoding = _match_file_encoding(line)
+        chars = []
+        for i in range(2):
+            c = f.read(1)
+            while c and c != '\n':
+                chars.append(c)
+                c = f.read(1)
+            encoding = _match_file_encoding(u''.join(chars))
             if encoding:
                 return encoding.group(1)
-            if line_no == 1:
-                break
     finally:
         f.close()
     return "UTF-8"