From c2de390db4a646063949874533cb398e71750e98 Mon Sep 17 00:00:00 2001
From: Stefan Behnel <scoder@users.berlios.de>
Date: Sat, 20 Mar 2010 19:51:51 +0100
Subject: [PATCH] fix universal newline parsing (which the codecs module
 doesn't support), use the fast 'io' module for file reading in Py2.6 and
 later

---
 Cython/Compiler/Scanning.py | 10 +++----
 Cython/Utils.py             | 56 ++++++++++++++++++++++++++++++++++---
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/Cython/Compiler/Scanning.py b/Cython/Compiler/Scanning.py
index 98120f0b..a0b01706 100644
--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -174,11 +174,11 @@ class FileSourceDescriptor(SourceDescriptor):
         self._cmp_name = filename
     
     def get_lines(self, encoding=None, error_handling=None):
-        if not encoding:
-            return Utils.open_source_file(self.filename)
-        else:
-            return codecs.open(self.filename, "rU", encoding=encoding,
-                               errors=error_handling)
+        return Utils.open_source_file(
+            self.filename, encoding=encoding,
+            error_handling=error_handling,
+            # newline normalisation is costly before Py2.6 (codecs fallback): skip it
+            require_normalised_newlines=False)
     
     def get_description(self):
         return self.filename
diff --git a/Cython/Utils.py b/Cython/Utils.py
index dfb4cf3a..a81d0f50 100644
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -63,7 +63,7 @@ _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
 
 def detect_file_encoding(source_filename):
     # PEPs 263 and 3120
-    f = codecs.open(source_filename, "rU", encoding="UTF-8")
+    f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
     try:
         chars = []
         for i in range(2):
@@ -78,9 +78,57 @@ def detect_file_encoding(source_filename):
         f.close()
     return "UTF-8"
 
-def open_source_file(source_filename, mode="rU"):
-    encoding = detect_file_encoding(source_filename)
-    return codecs.open(source_filename, mode=mode, encoding=encoding)
+normalise_newlines = re.compile(u'\r\n?|\n').sub
+
+class NormalisedNewlineStream(object):
+  """Stream wrapper providing the universal newline support that the
+  codecs module lacks: read() and readlines() both return CRLF and lone
+  CR line endings as LF.  The new 'io' in Py2.6+/3.1+ supports this out
+  of the box.
+  """
+  def __init__(self, stream):
+    # bind the wrapped stream's methods once; assumes .read() doesn't change
+    self._read = stream.read
+    self.close = stream.close
+    self.encoding = getattr(stream, 'encoding', 'UTF-8')
+
+  def read(self, count=-1):
+    data = self._read(count)
+    if u'\r' not in data:
+      return data
+    if data.endswith(u'\r'):
+      # may be missing a '\n'
+      data += self._read(1)
+    return normalise_newlines(u'\n', data)
+
+  def readlines(self):
+    content = []
+    data = self.read(0x1000)  # self.read, not self._read: chunks must be normalised
+    while data:
+      content.append(data)
+      data = self.read(0x1000)
+    return u''.join(content).split(u'\n')  # lines carry no terminators
+
+try:
+    from io import open as io_open
+except ImportError:
+    io_open = None
+
+def open_source_file(source_filename, mode="r",
+                     encoding=None, error_handling=None,
+                     require_normalised_newlines=True):
+    """Open a source file as a decoded text stream (encoding detected if None)."""
+    if encoding is None:
+        encoding = detect_file_encoding(source_filename)
+    if io_open is None:
+        # codecs module doesn't have universal newline support
+        reader = codecs.open(source_filename, mode=mode,
+                             encoding=encoding, errors=error_handling)
+        if not require_normalised_newlines:
+            return reader
+        return NormalisedNewlineStream(reader)
+    return io_open(source_filename, mode=mode,
+                   encoding=encoding, errors=error_handling)
 
 def long_literal(value):
     if isinstance(value, basestring):
-- 
2.26.2