fix universal newline parsing (which the codecs module doesn't support), use the...
author Stefan Behnel <scoder@users.berlios.de>
Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)
committer Stefan Behnel <scoder@users.berlios.de>
Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)
Cython/Compiler/Scanning.py
Cython/Utils.py

index 98120f0b1fc828b1b45ceb58b92fc77efe020fb1..a0b017060f99ead13efbe598cd3cfc1c359162cd 100644 (file)
@@ -174,11 +174,11 @@ class FileSourceDescriptor(SourceDescriptor):
         self._cmp_name = filename
     
     def get_lines(self, encoding=None, error_handling=None):
-        if not encoding:
-            return Utils.open_source_file(self.filename)
-        else:
-            return codecs.open(self.filename, "rU", encoding=encoding,
-                               errors=error_handling)
+        return Utils.open_source_file(
+            self.filename, encoding=encoding,
+            error_handling=error_handling,
+            # skip the newline-normalising wrapper: it is costly before Py2.6
+            require_normalised_newlines=False)
     
     def get_description(self):
         return self.filename
index dfb4cf3a7a42d03a8165f3c4061f46766f8e34b3..a81d0f502f22cf5e3d7708a06dbd7c7f1a30b8e8 100644 (file)
@@ -63,7 +63,7 @@ _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
 
 def detect_file_encoding(source_filename):
     # PEPs 263 and 3120
-    f = codecs.open(source_filename, "rU", encoding="UTF-8")
+    f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
     try:
         chars = []
         for i in range(2):
@@ -78,9 +78,57 @@ def detect_file_encoding(source_filename):
         f.close()
     return "UTF-8"
 
-def open_source_file(source_filename, mode="rU"):
-    encoding = detect_file_encoding(source_filename)
-    return codecs.open(source_filename, mode=mode, encoding=encoding)
+normalise_newlines = re.compile(u'\r\n?|\n').sub  # sub(repl, text): replaces each CRLF, CR or LF with repl
+
+class NormalisedNewlineStream(object):
+  """Stream wrapper that normalises CRLF and bare CR line endings to LF.
+  The codecs module doesn't provide universal newline support, so this
+  class adds it to read() and readlines().  The new 'io' module in
+  Py2.6+/3.1+ supports this out of the box.
+  """
+  def __init__(self, stream):
+    # bind the stream's methods once; assumes stream.read is never rebound
+    self._read = stream.read
+    self.close = stream.close
+    self.encoding = getattr(stream, 'encoding', 'UTF-8')
+
+  def read(self, count=-1):
+    data = self._read(count)
+    if u'\r' not in data:
+      return data
+    if data.endswith(u'\r'):
+      # the chunk may end between '\r' and '\n': pull in one more character
+      data += self._read(1)
+    return normalise_newlines(u'\n', data)
+
+  def readlines(self):
+    content = []
+    data = self.read(0x1000)  # read(), not _read(): chunks must be normalised
+    while data:
+        content.append(data)
+        data = self.read(0x1000)
+    return u''.join(content).split(u'\n')
+
+try:
+    from io import open as io_open
+except ImportError:
+    io_open = None
+
+def open_source_file(source_filename, mode="r",
+                     encoding=None, error_handling=None,
+                     require_normalised_newlines=True):
+    if encoding is None:  # no explicit encoding given: detect it from the file
+        encoding = detect_file_encoding(source_filename)
+    if io_open is not None:  # io path: require_normalised_newlines is not consulted
+        return io_open(source_filename, mode=mode,
+                       encoding=encoding, errors=error_handling)
+    else:
+        # codecs module doesn't have universal newline support: wrap on request
+        stream = codecs.open(source_filename, mode=mode,
+                             encoding=encoding, errors=error_handling)
+        if require_normalised_newlines:
+            stream = NormalisedNewlineStream(stream)
+        return stream
 
 def long_literal(value):
     if isinstance(value, basestring):