From: Stefan Behnel Date: Wed, 1 Aug 2007 23:43:48 +0000 (+0200) Subject: initial support for unicode literals in UTF-8 X-Git-Tag: 0.9.6.14~29^2~129^2~22 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=3f93988ff2f67c839330da4a8b14fa765ebadcf3;p=cython.git initial support for unicode literals in UTF-8 --- diff --git a/Cython/Compiler/Lexicon.py b/Cython/Compiler/Lexicon.py index 676f2036..89d7b43f 100644 --- a/Cython/Compiler/Lexicon.py +++ b/Cython/Compiler/Lexicon.py @@ -5,7 +5,7 @@ # to be rebuilt next time pyrexc is run. # -string_prefixes = "cCrR" +string_prefixes = "cCrRuU" def make_lexicon(): from Cython.Plex import \ diff --git a/Cython/Compiler/ModuleNode.py b/Cython/Compiler/ModuleNode.py index 4fd310fc..05945bb3 100644 --- a/Cython/Compiler/ModuleNode.py +++ b/Cython/Compiler/ModuleNode.py @@ -1071,13 +1071,16 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode): "static __Pyx_StringTabEntry %s[] = {" % Naming.stringtab_cname) for entry in entries: + print repr(entry.init), type(entry.init) code.putln( - "{&%s, %s, sizeof(%s)}," % ( + "{&%s, %s, sizeof(%s), %d}," % ( entry.pystring_cname, entry.cname, - entry.cname)) + entry.cname, + isinstance(entry.init, unicode) + )) code.putln( - "{0, 0, 0}") + "{0, 0, 0, 0}") code.putln( "};") diff --git a/Cython/Compiler/Nodes.py b/Cython/Compiler/Nodes.py index 92730120..19e62970 100644 --- a/Cython/Compiler/Nodes.py +++ b/Cython/Compiler/Nodes.py @@ -2600,7 +2600,7 @@ utility_function_predeclarations = \ typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; /*proto*/ typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/ -typedef struct {PyObject **p; char *s; long n;} __Pyx_StringTabEntry; /*proto*/ +typedef struct {PyObject **p; char *s; long n; int is_unicode;} __Pyx_StringTabEntry; /*proto*/ #define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False)) static INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { @@ -3104,7 +3104,11 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ """,""" static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { while (t->p) { - *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + if (t->is_unicode) { + *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL); + } else { + *t->p = PyString_FromStringAndSize(t->s, t->n - 1); + } if (!*t->p) return -1; ++t; diff --git a/Cython/Compiler/Parsing.py b/Cython/Compiler/Parsing.py index 9ff8c65b..cbe3126c 100644 --- a/Cython/Compiler/Parsing.py +++ b/Cython/Compiler/Parsing.py @@ -493,7 +493,7 @@ def p_opt_string_literal(s): def p_string_literal(s): # A single string or char literal. - # Returns (kind, value) where kind in ('', 'c', 'r') + # Returns (kind, value) where kind in ('', 'c', 'r', 'u') if s.sy == 'STRING': value = unquote(s.systring) s.next() @@ -502,7 +502,7 @@ def p_string_literal(s): pos = s.position() #is_raw = s.systring[:1].lower() == "r" kind = s.systring[:1].lower() - if kind not in "cr": + if kind not in "cru": kind = '' chars = [] while 1: @@ -513,6 +513,8 @@ def p_string_literal(s): systr = s.systring if len(systr) == 1 and systr in "'\"\n": chars.append('\\') + if kind == 'u' and not isinstance(systr, unicode): + systr = systr.decode("UTF-8") chars.append(systr) elif sy == 'ESCAPE': systr = s.systring @@ -533,6 +535,8 @@ def p_string_literal(s): chars.append('\\x0' + systr[2:]) elif c == '\n': pass + elif c == 'u': + chars.append(systr) else: chars.append(r'\\' + systr[1:]) elif sy == 'NEWLINE': @@ -546,7 +550,10 @@ def p_string_literal(s): "Unexpected token %r:%r in string literal" % (sy, s.systring)) s.next() - value = join(chars, '') + if kind == 'u': + value = u''.join(chars) + else: + value = ''.join(chars) #print "p_string_literal: value =", repr(value) ### return kind, value diff --git a/Cython/Compiler/PyrexTypes.py b/Cython/Compiler/PyrexTypes.py index b25fbc30..a7a6ca0c 100644 --- a/Cython/Compiler/PyrexTypes.py +++ b/Cython/Compiler/PyrexTypes.py @@ -705,6 +705,8 @@ class CStringType: from_py_function = "PyString_AsString" def literal_code(self, value): + if isinstance(value, unicode): + value = value.encode("UTF-8") return '"%s"' % value