From c2de390db4a646063949874533cb398e71750e98 Mon Sep 17 00:00:00 2001
From: Stefan Behnel
Date: Sat, 20 Mar 2010 19:51:51 +0100
Subject: [PATCH] fix universal newline parsing (which the codecs module
 doesn't support), use the fast 'io' module for file reading in Py2.6 and
 later

---
 Cython/Compiler/Scanning.py | 10 ++---
 Cython/Utils.py             | 81 +++++++++++++++++++++++++++++++++--
 2 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/Cython/Compiler/Scanning.py b/Cython/Compiler/Scanning.py
index 98120f0b..a0b01706 100644
--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -174,11 +174,11 @@ class FileSourceDescriptor(SourceDescriptor):
         self._cmp_name = filename
 
     def get_lines(self, encoding=None, error_handling=None):
-        if not encoding:
-            return Utils.open_source_file(self.filename)
-        else:
-            return codecs.open(self.filename, "rU", encoding=encoding,
-                               errors=error_handling)
+        return Utils.open_source_file(
+            self.filename, encoding=encoding,
+            error_handling=error_handling,
+            # newline normalisation is costly before Py2.6
+            require_normalised_newlines=False)
 
     def get_description(self):
         return self.filename
diff --git a/Cython/Utils.py b/Cython/Utils.py
index dfb4cf3a..a81d0f50 100644
--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -63,7 +63,7 @@ _match_file_encoding = re.compile(u"coding[:=]\s*([-\w.]+)").search
 
 def detect_file_encoding(source_filename):
     # PEPs 263 and 3120
-    f = codecs.open(source_filename, "rU", encoding="UTF-8")
+    f = open_source_file(source_filename, encoding="UTF-8", error_handling='ignore')
     try:
         chars = []
         for i in range(2):
@@ -78,9 +78,82 @@ def detect_file_encoding(source_filename):
         f.close()
     return "UTF-8"
 
-def open_source_file(source_filename, mode="rU"):
-    encoding = detect_file_encoding(source_filename)
-    return codecs.open(source_filename, mode=mode, encoding=encoding)
+normalise_newlines = re.compile(u'\r\n?|\n').sub
+
+class NormalisedNewlineStream(object):
+    """The codecs module doesn't provide universal newline support.
+    This class is used as a stream wrapper that provides this
+    functionality.  The new 'io' in Py2.6+/3.1+ supports this out of the
+    box.
+
+    Only read(), readlines(), close() and the 'encoding' attribute are
+    provided.  'encoding' falls back to 'UTF-8' if the wrapped stream
+    does not have one.
+    """
+    def __init__(self, stream):
+        # let's assume .read() doesn't change
+        self._read = stream.read
+        self.close = stream.close
+        self.encoding = getattr(stream, 'encoding', 'UTF-8')
+
+    def read(self, count):
+        """Read a chunk by passing 'count' to the wrapped stream's
+        read() and return it with CR LF and lone CR replaced by LF.
+        """
+        data = self._read(count)
+        if u'\r' not in data:
+            return data
+        while data.endswith(u'\r'):
+            # may be missing a '\n' - keep reading so that a CR LF pair
+            # is never split across two chunks (think of '\r\r\n')
+            more = self._read(1)
+            if not more:
+                break
+            data += more
+        return normalise_newlines(u'\n', data)
+
+    def readlines(self):
+        """Read the rest of the stream and return its lines as a list.
+        The lines are split at normalised newlines and do not include
+        the line ending.
+        """
+        content = []
+        # read through self.read() so that the newlines get normalised
+        data = self.read(0x1000)
+        while data:
+            content.append(data)
+            data = self.read(0x1000)
+        return u''.join(content).split(u'\n')
+
+try:
+    from io import open as io_open
+except ImportError:
+    io_open = None
+
+def open_source_file(source_filename, mode="r",
+                     encoding=None, error_handling=None,
+                     require_normalised_newlines=True):
+    """Open a source file as a decoding text stream.
+
+    If no encoding is given, it is determined by
+    detect_file_encoding().  'error_handling' is passed on as the
+    'errors' argument of the underlying open() function.  Where the
+    'io' module is available, it is used to open the file.  Otherwise,
+    codecs.open() is used and the stream gets wrapped in a
+    NormalisedNewlineStream if 'require_normalised_newlines' is true.
+    """
+    if encoding is None:
+        encoding = detect_file_encoding(source_filename)
+    if io_open is not None:
+        return io_open(source_filename, mode=mode,
+                       encoding=encoding, errors=error_handling)
+    else:
+        # codecs module doesn't have universal newline support
+        stream = codecs.open(source_filename, mode=mode,
+                             encoding=encoding, errors=error_handling)
+        if require_normalised_newlines:
+            stream = NormalisedNewlineStream(stream)
+        return stream
 
 def long_literal(value):
     if isinstance(value, basestring):
-- 
2.26.2