fix universal newline parsing (which the codecs module doesn't support), use the...

author Stefan Behnel <scoder@users.berlios.de>

Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)

committer Stefan Behnel <scoder@users.berlios.de>

Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)
author Stefan Behnel <scoder@users.berlios.de>
Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)
committer Stefan Behnel <scoder@users.berlios.de>
Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)
diff --git a/Cython/Compiler/Scanning.py b/Cython/Compiler/Scanning.py

index 98120f0b1fc828b1b45ceb58b92fc77efe020fb1..a0b017060f99ead13efbe598cd3cfc1c359162cd 100644 (file)
--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -174,11 +174,11 @@ class FileSourceDescriptor(SourceDescriptor):
          self._cmp_name = filename
      
      def get_lines(self, encoding=None, error_handling=None):
-        if not encoding:
-            return Utils.open_source_file(self.filename)
-        else:
-            return codecs.open(self.filename, "rU", encoding=encoding,
-                               errors=error_handling)
+        return Utils.open_source_file(
+            self.filename, encoding=encoding,
+            error_handling=error_handling,
+            # newline normalisation is costly before Py2.6
+            require_normalised_newlines=False)
      
      def get_description(self):
          return self.filename
diff --git a/Cython/Utils.py b/Cython/Utils.py

index dfb4cf3a7a42d03a8165f3c4061f46766f8e34b3..a81d0f502f22cf5e3d7708a06dbd7c7f1a30b8e8 100644 (file)
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -63,7 +63,7 @@ _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
  
  def detect_file_encoding(source_filename):
      # PEPs 263 and 3120
-    f = codecs.open(source_filename, "rU", encoding="UTF-8")
+    f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
      try:
          chars = []
          for i in range(2):
@@ -78,9 +78,57 @@ def detect_file_encoding(source_filename):
          f.close()
      return "UTF-8"
  
-def open_source_file(source_filename, mode="rU"):
-    encoding = detect_file_encoding(source_filename)
-    return codecs.open(source_filename, mode=mode, encoding=encoding)
+normalise_newlines = re.compile(u'\r\n?|\n').sub
+
+class NormalisedNewlineStream(object):
+  """The codecs module doesn't provide universal newline support.
+  This class is used as a stream wrapper that provides this
+  functionality.  The new 'io' in Py2.6+/3.1+ supports this out of the
+  box.
+  """
+  def __init__(self, stream):
+    # let's assume .read() doesn't change
+    self._read = stream.read
+    self.close = stream.close
+    self.encoding = getattr(stream, 'encoding', 'UTF-8')
+
+  def read(self, count):
+    data = self._read(count)
+    if u'\r' not in data:
+      return data
+    if data.endswith(u'\r'):
+      # may be missing a '\n'
+      data += self._read(1)
+    return normalise_newlines(u'\n', data)
+
+  def readlines(self):
+    content = []
+    data = self._read(0x1000)
+    while data:
+        content.append(data)
+        data = self._read(0x1000)
+    return u''.join(content).split(u'\n')
+
+try:
+    from io import open as io_open
+except ImportError:
+    io_open = None
+
+def open_source_file(source_filename, mode="r",
+                     encoding=None, error_handling=None,
+                     require_normalised_newlines=True):
+    if encoding is None:
+        encoding = detect_file_encoding(source_filename)
+    if io_open is not None:
+        return io_open(source_filename, mode=mode,
+                       encoding=encoding, errors=error_handling)
+    else:
+        # codecs module doesn't have universal newline support
+        stream = codecs.open(source_filename, mode=mode,
+                             encoding=encoding, errors=error_handling)
+        if require_normalised_newlines:
+            stream = NormalisedNewlineStream(stream)
+        return stream
  
  def long_literal(value):
      if isinstance(value, basestring):
author	Stefan Behnel <scoder@users.berlios.de>
	Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)
committer	Stefan Behnel <scoder@users.berlios.de>
	Sat, 20 Mar 2010 18:51:51 +0000 (19:51 +0100)
Cython/Compiler/Scanning.py		patch | blob | history
Cython/Utils.py		patch | blob | history