From 01be3445640ea840cb44d1c11394d143f2eeb1db Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 22 Aug 2008 09:05:07 +0200 Subject: [PATCH] added .pxd for PyUnicode C-API to Cython/Includes/ --- Cython/Includes/python.pxd | 1 + Cython/Includes/python_unicode.pxd | 385 +++++++++++++++++++++++++++++ 2 files changed, 386 insertions(+) create mode 100644 Cython/Includes/python_unicode.pxd diff --git a/Cython/Includes/python.pxd b/Cython/Includes/python.pxd index 9559db04..700318ff 100644 --- a/Cython/Includes/python.pxd +++ b/Cython/Includes/python.pxd @@ -136,6 +136,7 @@ from python_long cimport * from python_float cimport * from python_complex cimport * from python_string cimport * +from python_unicode cimport * from python_dict cimport * from python_instance cimport * from python_function cimport * diff --git a/Cython/Includes/python_unicode.pxd b/Cython/Includes/python_unicode.pxd new file mode 100644 index 00000000..0ba7eca3 --- /dev/null +++ b/Cython/Includes/python_unicode.pxd @@ -0,0 +1,385 @@ +cdef extern from *: + ctypedef int Py_UNICODE # integer stand-in for the character type; NOTE(review): the real width presumably comes from the C headers at compile time - confirm + + # Return true if the object o is a Unicode object or an instance + # of a Unicode subtype. Changed in version 2.2: Allowed subtypes + # to be accepted. + bint PyUnicode_Check(object o) + + # Return true if the object o is a Unicode object, but not an + # instance of a subtype. New in version 2.2. + bint PyUnicode_CheckExact(object o) + + # Return the size of the object. o has to be a PyUnicodeObject + # (not checked). + Py_ssize_t PyUnicode_GET_SIZE(object o) + + # Return the size of the object's internal buffer in bytes. o has + # to be a PyUnicodeObject (not checked). + Py_ssize_t PyUnicode_GET_DATA_SIZE(object o) + + # Return a pointer to the internal Py_UNICODE buffer of the + # object. o has to be a PyUnicodeObject (not checked). + Py_UNICODE* PyUnicode_AS_UNICODE(object o) + + # Return a pointer to the internal buffer of the object. o has to + # be a PyUnicodeObject (not checked). 
+ char* PyUnicode_AS_DATA(object o) + + # Return 1 or 0 depending on whether ch is a whitespace character. + bint Py_UNICODE_ISSPACE(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is a lowercase character. + bint Py_UNICODE_ISLOWER(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is an uppercase character. + bint Py_UNICODE_ISUPPER(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is a titlecase character. + bint Py_UNICODE_ISTITLE(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is a linebreak character. + bint Py_UNICODE_ISLINEBREAK(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is a decimal character. + bint Py_UNICODE_ISDECIMAL(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is a digit character. + bint Py_UNICODE_ISDIGIT(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is a numeric character. + bint Py_UNICODE_ISNUMERIC(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is an alphabetic character. + bint Py_UNICODE_ISALPHA(Py_UNICODE ch) + + # Return 1 or 0 depending on whether ch is an alphanumeric character. + bint Py_UNICODE_ISALNUM(Py_UNICODE ch) + + # Return the character ch converted to lower case. + Py_UNICODE Py_UNICODE_TOLOWER(Py_UNICODE ch) + + # Return the character ch converted to upper case. + Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch) + + # Return the character ch converted to title case. + Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch) + + # Return the character ch converted to a positive decimal + # integer. Return -1 if this is not possible. This macro does not + # raise exceptions. + int Py_UNICODE_TODECIMAL(Py_UNICODE ch) + + # Return the character ch converted to a single-digit + # integer. Return -1 if this is not possible. This macro does not + # raise exceptions. + int Py_UNICODE_TODIGIT(Py_UNICODE ch) + + # Return the character ch converted to a double. Return -1.0 if + # this is not possible. This macro does not raise exceptions. 
+ double Py_UNICODE_TONUMERIC(Py_UNICODE ch) + + # To create Unicode objects and access their basic sequence + # properties, use these APIs: + + # Create a Unicode object from the Py_UNICODE buffer u of the + # given size. u may be NULL which causes the contents to be + # undefined. It is the user's responsibility to fill in the needed + # data. The buffer is copied into the new object. If the buffer is + # not NULL, the return value might be a shared object. Therefore, + # modification of the resulting Unicode object is only allowed + # when u is NULL. + object PyUnicode_FromUnicode(Py_UNICODE *u, Py_ssize_t size) + + # Return a read-only pointer to the Unicode object's internal + # Py_UNICODE buffer, NULL if o is not a Unicode object. + Py_UNICODE* PyUnicode_AsUnicode(object o) + + # Return the length of the Unicode object. + Py_ssize_t PyUnicode_GetSize(object o) + + # Coerce an encoded object o to a Unicode object and return a + # reference with incremented refcount. + # String and other char buffer compatible objects are decoded + # according to the given encoding and using the error handling + # defined by errors. Both can be NULL to have the interface use + # the default values (see the next section for details). + # All other objects, including Unicode objects, cause a TypeError + # to be set. + object PyUnicode_FromEncodedObject(object o, char *encoding, char *errors) + + # Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict") + # which is used throughout the interpreter whenever coercion to + # Unicode is needed. + object PyUnicode_FromObject(object obj) + + # If the platform supports wchar_t and provides a header file + # wchar.h, Python can interface directly to this type using the + # following functions. Support is optimized if Python's own + # Py_UNICODE type is identical to the system's wchar_t. + + #ctypedef int wchar_t + + # Create a Unicode object from the wchar_t buffer w of the given + # size. Return NULL on failure. 
+ #PyObject* PyUnicode_FromWideChar(wchar_t *w, Py_ssize_t size) + + #Py_ssize_t PyUnicode_AsWideChar(object o, wchar_t *w, Py_ssize_t size) + +# Codecs + + # Create a Unicode object by decoding size bytes of the encoded + # string s. encoding and errors have the same meaning as the + # parameters of the same name in the unicode() builtin + # function. The codec to be used is looked up using the Python + # codec registry. Return NULL if an exception was raised by the + # codec. + object PyUnicode_Decode(char *s, Py_ssize_t size, char *encoding, char *errors) + + # Encode the Py_UNICODE buffer of the given size and return a + # Python string object. encoding and errors have the same meaning + # as the parameters of the same name in the Unicode encode() + # method. The codec to be used is looked up using the Python codec + # registry. Return NULL if an exception was raised by the codec. + object PyUnicode_Encode(Py_UNICODE *s, Py_ssize_t size, + char *encoding, char *errors) + + # Encode a Unicode object and return the result as a Python string + # object. encoding and errors have the same meaning as the + # parameters of the same name in the Unicode encode() method. The + # codec to be used is looked up using the Python codec + # registry. Return NULL if an exception was raised by the codec. + object PyUnicode_AsEncodedString(object unicode, char *encoding, char *errors) + +# These are the UTF-8 codec APIs: + + # Create a Unicode object by decoding size bytes of the UTF-8 + # encoded string s. Return NULL if an exception was raised by the + # codec. + object PyUnicode_DecodeUTF8(char *s, Py_ssize_t size, char *errors) + + # If consumed is NULL, behave like PyUnicode_DecodeUTF8(). If + # consumed is not NULL, trailing incomplete UTF-8 byte sequences + # will not be treated as an error. Those bytes will not be decoded + # and the number of bytes that have been decoded will be stored in + # consumed. New in version 2.4. 
+ object PyUnicode_DecodeUTF8Stateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed) + + # Encode the Py_UNICODE buffer of the given size using UTF-8 and + # return a Python string object. Return NULL if an exception was + # raised by the codec. + object PyUnicode_EncodeUTF8(Py_UNICODE *s, Py_ssize_t size, char *errors) + + # Encode a Unicode object using UTF-8 and return the result as a Python string object. Error handling is ``strict''. Return NULL if an exception was raised by the codec. + object PyUnicode_AsUTF8String(object unicode) + +# These are the UTF-16 codec APIs: + + # Decode size bytes from the UTF-16 encoded buffer s and + # return the corresponding Unicode object. errors (if non-NULL) + # defines the error handling. It defaults to ``strict''. + # + # If byteorder is non-NULL, the decoder starts decoding using the + # given byte order: + # + # *byteorder == -1: little endian + # *byteorder == 0: native order + # *byteorder == 1: big endian + # + # and then switches if the first two bytes of the input data are a + # byte order mark (BOM) and the specified byte order is native + # order. This BOM is not copied into the resulting Unicode + # string. After completion, *byteorder is set to the current byte + # order at the end of the input data. + # + # If byteorder is NULL, the codec starts in native order mode. + object PyUnicode_DecodeUTF16(char *s, Py_ssize_t size, char *errors, int *byteorder) + + # If consumed is NULL, behave like PyUnicode_DecodeUTF16(). If + # consumed is not NULL, PyUnicode_DecodeUTF16Stateful() will not + # treat trailing incomplete UTF-16 byte sequences (such as an odd + # number of bytes or a split surrogate pair) as an error. Those + # bytes will not be decoded and the number of bytes that have been + # decoded will be stored in consumed. New in version 2.4. 
+ object PyUnicode_DecodeUTF16Stateful(char *s, Py_ssize_t size, char *errors, int *byteorder, Py_ssize_t *consumed) + + # Return a Python string object holding the UTF-16 encoded value + # of the Unicode data in s. If byteorder is not 0, output is + # written according to the following byte order: + # + # byteorder == -1: little endian + # byteorder == 0: native byte order (writes a BOM mark) + # byteorder == 1: big endian + # + # If byteorder is 0, the output string will always start with the + # Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark + # is prepended. + # + # If Py_UNICODE_WIDE is defined, a single Py_UNICODE value may get + # represented as a surrogate pair. If it is not defined, each + # Py_UNICODE value is interpreted as a UCS-2 character. + object PyUnicode_EncodeUTF16(Py_UNICODE *s, Py_ssize_t size, char *errors, int byteorder) + + # Return a Python string using the UTF-16 encoding in native byte + # order. The string always starts with a BOM mark. Error handling + # is ``strict''. Return NULL if an exception was raised by the + # codec. + object PyUnicode_AsUTF16String(object unicode) + +# These are the ``Unicode Escape'' codec APIs: + + # Create a Unicode object by decoding size bytes of the + # Unicode-Escape encoded string s. Return NULL if an exception was + # raised by the codec. + object PyUnicode_DecodeUnicodeEscape(char *s, Py_ssize_t size, char *errors) + + # Encode the Py_UNICODE buffer of the given size using + # Unicode-Escape and return a Python string object. Return NULL if + # an exception was raised by the codec. + object PyUnicode_EncodeUnicodeEscape(Py_UNICODE *s, Py_ssize_t size) + + # Encode a Unicode object using Unicode-Escape and return the + # result as a Python string object. Error handling is + # ``strict''. Return NULL if an exception was raised by the codec. 
+ object PyUnicode_AsUnicodeEscapeString(object unicode) + +# These are the ``Raw Unicode Escape'' codec APIs: + + # Create a Unicode object by decoding size bytes of the + # Raw-Unicode-Escape encoded string s. Return NULL if an exception + # was raised by the codec. + object PyUnicode_DecodeRawUnicodeEscape(char *s, Py_ssize_t size, char *errors) + + # Encode the Py_UNICODE buffer of the given size using + # Raw-Unicode-Escape and return a Python string object. Return + # NULL if an exception was raised by the codec. + object PyUnicode_EncodeRawUnicodeEscape(Py_UNICODE *s, Py_ssize_t size, char *errors) + + # Encode a Unicode object using Raw-Unicode-Escape and return the + # result as a Python string object. Error handling is + # ``strict''. Return NULL if an exception was raised by the codec. + object PyUnicode_AsRawUnicodeEscapeString(object unicode) + +# These are the Latin-1 codec APIs: Latin-1 corresponds to the first 256 Unicode ordinals and only these are accepted by the codecs during encoding. + + # Create a Unicode object by decoding size bytes of the Latin-1 + # encoded string s. Return NULL if an exception was raised by the + # codec. + object PyUnicode_DecodeLatin1(char *s, Py_ssize_t size, char *errors) + + # Encode the Py_UNICODE buffer of the given size using Latin-1 and + # return a Python string object. Return NULL if an exception was + # raised by the codec. + object PyUnicode_EncodeLatin1(Py_UNICODE *s, Py_ssize_t size, char *errors) + + # Encode a Unicode object using Latin-1 and return the result as a + # Python string object. Error handling is ``strict''. Return NULL + # if an exception was raised by the codec. + object PyUnicode_AsLatin1String(object unicode) + +# These are the ASCII codec APIs. Only 7-bit ASCII data is +# accepted. All other codes generate errors. + + # Create a Unicode object by decoding size bytes of the ASCII + # encoded string s. Return NULL if an exception was raised by the + # codec. 
+ object PyUnicode_DecodeASCII(char *s, Py_ssize_t size, char *errors) + + # Encode the Py_UNICODE buffer of the given size using ASCII and + # return a Python string object. Return NULL if an exception was + # raised by the codec. + object PyUnicode_EncodeASCII(Py_UNICODE *s, Py_ssize_t size, char *errors) + + # Encode a Unicode object using ASCII and return the result as a + # Python string object. Error handling is ``strict''. Return NULL + # if an exception was raised by the codec. + object PyUnicode_AsASCIIString(object o) + +# These are the mapping codec APIs: +# +# This codec is special in that it can be used to implement many +# different codecs (and this is in fact what was done to obtain most +# of the standard codecs included in the encodings package). The codec +# uses mapping to encode and decode characters. +# +# Decoding mappings must map single string characters to single +# Unicode characters, integers (which are then interpreted as Unicode +# ordinals) or None (meaning "undefined mapping" and causing an +# error). +# +# Encoding mappings must map single Unicode characters to single +# string characters, integers (which are then interpreted as Latin-1 +# ordinals) or None (meaning "undefined mapping" and causing an +# error). +# +# The mapping objects provided must only support the __getitem__ +# mapping interface. +# +# If a character lookup fails with a LookupError, the character is +# copied as-is meaning that its ordinal value will be interpreted as +# Unicode or Latin-1 ordinal resp. Because of this, mappings only need +# to contain those mappings which map characters to different code +# points. + + # Create a Unicode object by decoding size bytes of the encoded + # string s using the given mapping object. Return NULL if an + # exception was raised by the codec. If mapping is NULL latin-1 + # decoding will be done. Else it can be a dictionary mapping byte + # or a unicode string, which is treated as a lookup table. 
Byte + # values greater than the length of the string and U+FFFE + # "characters" are treated as "undefined mapping". Changed in + # version 2.4: Allowed unicode string as mapping argument. + object PyUnicode_DecodeCharmap(char *s, Py_ssize_t size, object mapping, char *errors) + + # Encode the Py_UNICODE buffer of the given size using the given + # mapping object and return a Python string object. Return NULL if + # an exception was raised by the codec. + object PyUnicode_EncodeCharmap(Py_UNICODE *s, Py_ssize_t size, object mapping, char *errors) + + # Encode a Unicode object using the given mapping object and + # return the result as a Python string object. Error handling is + # ``strict''. Return NULL if an exception was raised by the codec. + object PyUnicode_AsCharmapString(object o, object mapping) + +# The following codec API is special in that it maps Unicode to Unicode. + + # Translate a Py_UNICODE buffer of the given length by applying a + # character mapping table to it and return the resulting Unicode + # object. Return NULL when an exception was raised by the codec. + # + # The mapping table must map Unicode ordinal integers to Unicode + # ordinal integers or None (causing deletion of the character). + # + # Mapping tables need only provide the __getitem__() interface; + # dictionaries and sequences work well. Unmapped character + # ordinals (ones which cause a LookupError) are left untouched and + # are copied as-is. + object PyUnicode_TranslateCharmap(Py_UNICODE *s, Py_ssize_t size, + object table, char *errors) + +# These are the MBCS codec APIs. They are currently only available on +# Windows and use the Win32 MBCS converters to implement the +# conversions. Note that MBCS (or DBCS) is a class of encodings, not +# just one. The target encoding is defined by the user settings on the +# machine running the codec. + + # Create a Unicode object by decoding size bytes of the MBCS + # encoded string s. Return NULL if an exception was raised by the + # codec. 
+ object PyUnicode_DecodeMBCS(char *s, Py_ssize_t size, char *errors) + + # If consumed is NULL, behave like PyUnicode_DecodeMBCS(). If + # consumed is not NULL, PyUnicode_DecodeMBCSStateful() will not + # decode trailing lead byte and the number of bytes that have been + # decoded will be stored in consumed. New in version 2.5. NOTE(review): size and consumed are declared Py_ssize_t to match PyUnicode_DecodeMBCS and the UTF-8/UTF-16 stateful decoders; the Python 2.x docs list int here - confirm against unicodeobject.h. + object PyUnicode_DecodeMBCSStateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed) + + # Encode the Py_UNICODE buffer of the given size using MBCS and + # return a Python string object. Return NULL if an exception was + # raised by the codec. + object PyUnicode_EncodeMBCS(Py_UNICODE *s, Py_ssize_t size, char *errors) + + # Encode a Unicode object using MBCS and return the result as a + # Python string object. Error handling is ``strict''. Return NULL + # if an exception was raised by the codec. + object PyUnicode_AsMBCSString(object o) -- 2.26.2