| 1 | #ifndef Py_UNICODEOBJECT_H
|
|---|
| 2 | #define Py_UNICODEOBJECT_H
|
|---|
| 3 |
|
|---|
| 4 | /*
|
|---|
| 5 |
|
|---|
| 6 | Unicode implementation based on original code by Fredrik Lundh,
|
|---|
| 7 | modified by Marc-Andre Lemburg ([email protected]) according to the
|
|---|
| 8 | Unicode Integration Proposal (see file Misc/unicode.txt).
|
|---|
| 9 |
|
|---|
| 10 | Copyright (c) Corporation for National Research Initiatives.
|
|---|
| 11 |
|
|---|
| 12 |
|
|---|
| 13 | Original header:
|
|---|
| 14 | --------------------------------------------------------------------
|
|---|
| 15 |
|
|---|
| 16 | * Yet another Unicode string type for Python. This type supports the
|
|---|
| 17 | * 16-bit Basic Multilingual Plane (BMP) only.
|
|---|
| 18 | *
|
|---|
| 19 | * Written by Fredrik Lundh, January 1999.
|
|---|
| 20 | *
|
|---|
| 21 | * Copyright (c) 1999 by Secret Labs AB.
|
|---|
| 22 | * Copyright (c) 1999 by Fredrik Lundh.
|
|---|
| 23 | *
|
|---|
| 24 | * [email protected]
|
|---|
| 25 | * http://www.pythonware.com
|
|---|
| 26 | *
|
|---|
| 27 | * --------------------------------------------------------------------
|
|---|
| 28 | * This Unicode String Type is
|
|---|
| 29 | *
|
|---|
| 30 | * Copyright (c) 1999 by Secret Labs AB
|
|---|
| 31 | * Copyright (c) 1999 by Fredrik Lundh
|
|---|
| 32 | *
|
|---|
| 33 | * By obtaining, using, and/or copying this software and/or its
|
|---|
| 34 | * associated documentation, you agree that you have read, understood,
|
|---|
| 35 | * and will comply with the following terms and conditions:
|
|---|
| 36 | *
|
|---|
| 37 | * Permission to use, copy, modify, and distribute this software and its
|
|---|
| 38 | * associated documentation for any purpose and without fee is hereby
|
|---|
| 39 | * granted, provided that the above copyright notice appears in all
|
|---|
| 40 | * copies, and that both that copyright notice and this permission notice
|
|---|
| 41 | * appear in supporting documentation, and that the name of Secret Labs
|
|---|
| 42 | * AB or the author not be used in advertising or publicity pertaining to
|
|---|
| 43 | * distribution of the software without specific, written prior
|
|---|
| 44 | * permission.
|
|---|
| 45 | *
|
|---|
| 46 | * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
|
|---|
| 47 | * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
|
|---|
| 48 | * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
|
|---|
| 49 | * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|---|
| 50 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|---|
| 51 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
|
|---|
| 52 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|---|
| 53 | * -------------------------------------------------------------------- */
|
|---|
| 54 |
|
|---|
| 55 | #include <ctype.h>
|
|---|
| 56 |
|
|---|
| 57 | /* === Internal API ======================================================= */
|
|---|
| 58 |
|
|---|
| 59 | /* --- Internal Unicode Format -------------------------------------------- */
|
|---|
| 60 |
|
|---|
| 61 | #ifndef Py_USING_UNICODE
|
|---|
| 62 |
|
|---|
| 63 | #define PyUnicode_Check(op) 0
|
|---|
| 64 | #define PyUnicode_CheckExact(op) 0
|
|---|
| 65 |
|
|---|
| 66 | #else
|
|---|
| 67 |
|
|---|
| 68 | /* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
|
|---|
| 69 | properly set, but the default rules below don't set it. I'll
|
|---|
| 70 | sort this out some other day -- [email protected] */
|
|---|
| 71 |
|
|---|
| 72 | #ifndef Py_UNICODE_SIZE
|
|---|
| 73 | #error Must define Py_UNICODE_SIZE
|
|---|
| 74 | #endif
|
|---|
| 75 |
|
|---|
| 76 | /* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
|
|---|
| 77 | strings are stored as UCS-2 (with limited support for UTF-16) */
|
|---|
| 78 |
|
|---|
| 79 | #if Py_UNICODE_SIZE >= 4
|
|---|
| 80 | #define Py_UNICODE_WIDE
|
|---|
| 81 | #endif
|
|---|
| 82 |
|
|---|
| 83 | /* Set these flags if the platform has "wchar.h", "wctype.h" and the
|
|---|
| 84 | wchar_t type is a 16-bit unsigned type */
|
|---|
| 85 | /* #define HAVE_WCHAR_H */
|
|---|
| 86 | /* #define HAVE_USABLE_WCHAR_T */
|
|---|
| 87 |
|
|---|
| 88 | /* Defaults for various platforms */
|
|---|
| 89 | #ifndef PY_UNICODE_TYPE
|
|---|
| 90 |
|
|---|
| 91 | /* Windows has a usable wchar_t type (unless we're using UCS-4) */
|
|---|
| 92 | # if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
|
|---|
| 93 | # define HAVE_USABLE_WCHAR_T
|
|---|
| 94 | # define PY_UNICODE_TYPE wchar_t
|
|---|
| 95 | # endif
|
|---|
| 96 |
|
|---|
| 97 | # if defined(Py_UNICODE_WIDE)
|
|---|
| 98 | # define PY_UNICODE_TYPE Py_UCS4
|
|---|
| 99 | # endif
|
|---|
| 100 |
|
|---|
| 101 | #endif
|
|---|
| 102 |
|
|---|
| 103 | /* If the compiler provides a wchar_t type we try to support it
|
|---|
| 104 | through the interface functions PyUnicode_FromWideChar() and
|
|---|
| 105 | PyUnicode_AsWideChar(). */
|
|---|
| 106 |
|
|---|
| 107 | #ifdef HAVE_USABLE_WCHAR_T
|
|---|
| 108 | # ifndef HAVE_WCHAR_H
|
|---|
| 109 | # define HAVE_WCHAR_H
|
|---|
| 110 | # endif
|
|---|
| 111 | #endif
|
|---|
| 112 |
|
|---|
| 113 | #ifdef HAVE_WCHAR_H
|
|---|
| 114 | /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
|
|---|
| 115 | # ifdef _HAVE_BSDI
|
|---|
| 116 | # include <time.h>
|
|---|
| 117 | # endif
|
|---|
| 118 | # include <wchar.h>
|
|---|
| 119 | #endif
|
|---|
| 120 |
|
|---|
| 121 | /*
|
|---|
| 122 | * Use this typedef when you need to represent a UTF-16 surrogate pair
|
|---|
| 123 | * as a single unsigned integer. Note that no Py_UCS4 typedef is emitted when neither SIZEOF_INT nor SIZEOF_LONG is >= 4.
|
|---|
| 124 | */
|
|---|
| 125 | #if SIZEOF_INT >= 4
|
|---|
| 126 | typedef unsigned int Py_UCS4;
|
|---|
| 127 | #elif SIZEOF_LONG >= 4
|
|---|
| 128 | typedef unsigned long Py_UCS4;
|
|---|
| 129 | #endif
|
|---|
| 130 |
|
|---|
| 131 | typedef PY_UNICODE_TYPE Py_UNICODE;
|
|---|
| 132 |
|
|---|
| 133 | /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
|
|---|
| 134 |
|
|---|
| 135 | /* Unicode API names are mangled to ensure that UCS-2 and UCS-4 builds
|
|---|
| 136 | produce different external names and thus cause import errors when
|
|---|
| 137 | a Python interpreter and extensions compiled with different Unicode
|
|---|
| 138 | width assumptions are combined. */
|
|---|
| 139 |
|
|---|
| 140 | #ifndef Py_UNICODE_WIDE
|
|---|
| 141 |
|
|---|
| 142 | # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
|
|---|
| 143 | # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
|
|---|
| 144 | # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
|
|---|
| 145 | # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
|
|---|
| 146 | # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
|
|---|
| 147 | # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
|
|---|
| 148 | # define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
|
|---|
| 149 | # define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
|
|---|
| 150 | # define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
|
|---|
| 151 | # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
|
|---|
| 152 | # define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
|
|---|
| 153 | # define PyUnicode_Compare PyUnicodeUCS2_Compare
|
|---|
| 154 | # define PyUnicode_Concat PyUnicodeUCS2_Concat
|
|---|
| 155 | # define PyUnicode_Contains PyUnicodeUCS2_Contains
|
|---|
| 156 | # define PyUnicode_Count PyUnicodeUCS2_Count
|
|---|
| 157 | # define PyUnicode_Decode PyUnicodeUCS2_Decode
|
|---|
| 158 | # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
|
|---|
| 159 | # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
|
|---|
| 160 | # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
|
|---|
| 161 | # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
|
|---|
| 162 | # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
|
|---|
| 163 | # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
|
|---|
| 164 | # define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
|
|---|
| 165 | # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
|
|---|
| 166 | # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
|
|---|
| 167 | # define PyUnicode_Encode PyUnicodeUCS2_Encode
|
|---|
| 168 | # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
|
|---|
| 169 | # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
|
|---|
| 170 | # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
|
|---|
| 171 | # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
|
|---|
| 172 | # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
|
|---|
| 173 | # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
|
|---|
| 174 | # define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
|
|---|
| 175 | # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
|
|---|
| 176 | # define PyUnicode_Find PyUnicodeUCS2_Find
|
|---|
| 177 | # define PyUnicode_Format PyUnicodeUCS2_Format
|
|---|
| 178 | # define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
|
|---|
| 179 | # define PyUnicode_FromObject PyUnicodeUCS2_FromObject
|
|---|
| 180 | # define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
|
|---|
| 181 | # define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
|
|---|
| 182 | # define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
|
|---|
| 183 | # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
|
|---|
| 184 | # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
|
|---|
| 185 | # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
|
|---|
| 186 | # define PyUnicode_Join PyUnicodeUCS2_Join
|
|---|
| 187 | # define PyUnicode_Partition PyUnicodeUCS2_Partition
|
|---|
| 188 | # define PyUnicode_RPartition PyUnicodeUCS2_RPartition
|
|---|
| 189 | # define PyUnicode_RSplit PyUnicodeUCS2_RSplit
|
|---|
| 190 | # define PyUnicode_Replace PyUnicodeUCS2_Replace
|
|---|
| 191 | # define PyUnicode_Resize PyUnicodeUCS2_Resize
|
|---|
| 192 | # define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
|
|---|
| 193 | # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
|
|---|
| 194 | # define PyUnicode_Split PyUnicodeUCS2_Split
|
|---|
| 195 | # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
|
|---|
| 196 | # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
|
|---|
| 197 | # define PyUnicode_Translate PyUnicodeUCS2_Translate
|
|---|
| 198 | # define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
|
|---|
| 199 | # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
|
|---|
| 200 | # define _PyUnicode_Fini _PyUnicodeUCS2_Fini
|
|---|
| 201 | # define _PyUnicode_Init _PyUnicodeUCS2_Init
|
|---|
| 202 | # define _PyUnicode_IsAlpha _PyUnicodeUCS2_IsAlpha
|
|---|
| 203 | # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS2_IsDecimalDigit
|
|---|
| 204 | # define _PyUnicode_IsDigit _PyUnicodeUCS2_IsDigit
|
|---|
| 205 | # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
|
|---|
| 206 | # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
|---|
| 207 | # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
|---|
| 208 | # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
|---|
| 209 | # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
|
|---|
| 210 | # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
|
|---|
| 211 | # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS2_ToDecimalDigit
|
|---|
| 212 | # define _PyUnicode_ToDigit _PyUnicodeUCS2_ToDigit
|
|---|
| 213 | # define _PyUnicode_ToLowercase _PyUnicodeUCS2_ToLowercase
|
|---|
| 214 | # define _PyUnicode_ToNumeric _PyUnicodeUCS2_ToNumeric
|
|---|
| 215 | # define _PyUnicode_ToTitlecase _PyUnicodeUCS2_ToTitlecase
|
|---|
| 216 | # define _PyUnicode_ToUppercase _PyUnicodeUCS2_ToUppercase
|
|---|
| 217 |
|
|---|
| 218 | #else
|
|---|
| 219 |
|
|---|
| 220 | # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
|
|---|
| 221 | # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
|
|---|
| 222 | # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
|
|---|
| 223 | # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
|
|---|
| 224 | # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
|
|---|
| 225 | # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
|
|---|
| 226 | # define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
|
|---|
| 227 | # define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
|
|---|
| 228 | # define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
|
|---|
| 229 | # define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
|
|---|
| 230 | # define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
|
|---|
| 231 | # define PyUnicode_Compare PyUnicodeUCS4_Compare
|
|---|
| 232 | # define PyUnicode_Concat PyUnicodeUCS4_Concat
|
|---|
| 233 | # define PyUnicode_Contains PyUnicodeUCS4_Contains
|
|---|
| 234 | # define PyUnicode_Count PyUnicodeUCS4_Count
|
|---|
| 235 | # define PyUnicode_Decode PyUnicodeUCS4_Decode
|
|---|
| 236 | # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
|
|---|
| 237 | # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
|
|---|
| 238 | # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
|
|---|
| 239 | # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
|
|---|
| 240 | # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
|
|---|
| 241 | # define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
|
|---|
| 242 | # define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
|
|---|
| 243 | # define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
|
|---|
| 244 | # define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
|
|---|
| 245 | # define PyUnicode_Encode PyUnicodeUCS4_Encode
|
|---|
| 246 | # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
|
|---|
| 247 | # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
|
|---|
| 248 | # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
|
|---|
| 249 | # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
|
|---|
| 250 | # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
|
|---|
| 251 | # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
|
|---|
| 252 | # define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
|
|---|
| 253 | # define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
|
|---|
| 254 | # define PyUnicode_Find PyUnicodeUCS4_Find
|
|---|
| 255 | # define PyUnicode_Format PyUnicodeUCS4_Format
|
|---|
| 256 | # define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
|
|---|
| 257 | # define PyUnicode_FromObject PyUnicodeUCS4_FromObject
|
|---|
| 258 | # define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
|
|---|
| 259 | # define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
|
|---|
| 260 | # define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
|
|---|
| 261 | # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
|
|---|
| 262 | # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
|
|---|
| 263 | # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
|
|---|
| 264 | # define PyUnicode_Join PyUnicodeUCS4_Join
|
|---|
| 265 | # define PyUnicode_Partition PyUnicodeUCS4_Partition
|
|---|
| 266 | # define PyUnicode_RPartition PyUnicodeUCS4_RPartition
|
|---|
| 267 | # define PyUnicode_RSplit PyUnicodeUCS4_RSplit
|
|---|
| 268 | # define PyUnicode_Replace PyUnicodeUCS4_Replace
|
|---|
| 269 | # define PyUnicode_Resize PyUnicodeUCS4_Resize
|
|---|
| 270 | # define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
|
|---|
| 271 | # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
|
|---|
| 272 | # define PyUnicode_Split PyUnicodeUCS4_Split
|
|---|
| 273 | # define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
|
|---|
| 274 | # define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
|
|---|
| 275 | # define PyUnicode_Translate PyUnicodeUCS4_Translate
|
|---|
| 276 | # define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
|
|---|
| 277 | # define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
|
|---|
| 278 | # define _PyUnicode_Fini _PyUnicodeUCS4_Fini
|
|---|
| 279 | # define _PyUnicode_Init _PyUnicodeUCS4_Init
|
|---|
| 280 | # define _PyUnicode_IsAlpha _PyUnicodeUCS4_IsAlpha
|
|---|
| 281 | # define _PyUnicode_IsDecimalDigit _PyUnicodeUCS4_IsDecimalDigit
|
|---|
| 282 | # define _PyUnicode_IsDigit _PyUnicodeUCS4_IsDigit
|
|---|
| 283 | # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
|
|---|
| 284 | # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
|---|
| 285 | # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
|---|
| 286 | # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
|---|
| 287 | # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
|
|---|
| 288 | # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
|
|---|
| 289 | # define _PyUnicode_ToDecimalDigit _PyUnicodeUCS4_ToDecimalDigit
|
|---|
| 290 | # define _PyUnicode_ToDigit _PyUnicodeUCS4_ToDigit
|
|---|
| 291 | # define _PyUnicode_ToLowercase _PyUnicodeUCS4_ToLowercase
|
|---|
| 292 | # define _PyUnicode_ToNumeric _PyUnicodeUCS4_ToNumeric
|
|---|
| 293 | # define _PyUnicode_ToTitlecase _PyUnicodeUCS4_ToTitlecase
|
|---|
| 294 | # define _PyUnicode_ToUppercase _PyUnicodeUCS4_ToUppercase
|
|---|
| 295 |
|
|---|
| 296 |
|
|---|
| 297 | #endif
|
|---|
| 298 |
|
|---|
| 299 | /* --- Internal Unicode Operations ---------------------------------------- */
|
|---|
| 300 |
|
|---|
| 301 | /* If you want Python to use the compiler's wctype.h functions instead
|
|---|
| 302 | of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
|
|---|
| 303 | configure Python using --with-wctype-functions. This reduces the
|
|---|
| 304 | interpreter's code size. */
|
|---|
| 305 |
|
|---|
| 306 | #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
|
|---|
| 307 |
|
|---|
| 308 | #include <wctype.h>
|
|---|
| 309 |
|
|---|
| 310 | #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
|
|---|
| 311 |
|
|---|
| 312 | #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
|
|---|
| 313 | #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
|
|---|
| 314 | #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
|
|---|
| 315 | #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
|
|---|
| 316 |
|
|---|
| 317 | #define Py_UNICODE_TOLOWER(ch) towlower(ch)
|
|---|
| 318 | #define Py_UNICODE_TOUPPER(ch) towupper(ch)
|
|---|
| 319 | #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
|
|---|
| 320 |
|
|---|
| 321 | #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
|
|---|
| 322 | #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
|
|---|
| 323 | #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
|
|---|
| 324 |
|
|---|
| 325 | #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
|
|---|
| 326 | #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
|
|---|
| 327 | #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
|
|---|
| 328 |
|
|---|
| 329 | #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
|
|---|
| 330 |
|
|---|
| 331 | #else
|
|---|
| 332 |
|
|---|
| 333 | #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
|
|---|
| 334 |
|
|---|
| 335 | #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
|
|---|
| 336 | #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
|
|---|
| 337 | #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
|
|---|
| 338 | #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
|
|---|
| 339 |
|
|---|
| 340 | #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
|
|---|
| 341 | #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
|
|---|
| 342 | #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
|
|---|
| 343 |
|
|---|
| 344 | #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
|
|---|
| 345 | #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
|
|---|
| 346 | #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
|
|---|
| 347 |
|
|---|
| 348 | #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
|
|---|
| 349 | #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
|
|---|
| 350 | #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
|
|---|
| 351 |
|
|---|
| 352 | #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
|
|---|
| 353 |
|
|---|
| 354 | #endif
|
|---|
| 355 |
|
|---|
| 356 | #define Py_UNICODE_ISALNUM(ch) \
|
|---|
| 357 | (Py_UNICODE_ISALPHA(ch) || \
|
|---|
| 358 | Py_UNICODE_ISDECIMAL(ch) || \
|
|---|
| 359 | Py_UNICODE_ISDIGIT(ch) || \
|
|---|
| 360 | Py_UNICODE_ISNUMERIC(ch)) /* true if ch is alphabetic, decimal, digit or numeric; ch may be evaluated up to four times, so it must not have side effects */
|
|---|
| 361 |
|
|---|
| 362 | #define Py_UNICODE_COPY(target, source, length) \
|
|---|
| 363 | Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE)) /* copies length Py_UNICODE elements (not bytes) from source to target via Py_MEMCPY */
|
|---|
| 364 |
|
|---|
| 365 | #define Py_UNICODE_FILL(target, value, length) do\
|
|---|
| 366 | {Py_ssize_t i_; Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
|
|---|
| 367 | for (i_ = 0; i_ < (length); i_++) t_[i_] = v_;\
|
|---|
| 368 | } while (0) /* stores value into the first length elements of target; target and value are evaluated once, length is re-evaluated on every loop iteration */
|
|---|
| 369 |
|
|---|
| 370 | /* Check whether substring matches string at the given offset. The first and last characters are compared before the full memcmp() is attempted. The offset must be
|
|---|
| 371 | valid, and the substring must not be empty. The arguments are evaluated more than once, so they must not have side effects. */
|
|---|
| 372 | #define Py_UNICODE_MATCH(string, offset, substring) \
|
|---|
| 373 | ((*((string)->str + (offset)) == *((substring)->str)) && \
|
|---|
| 374 | ((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
|
|---|
| 375 | !memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
|
|---|
| 376 |
|
|---|
| 377 | #ifdef __cplusplus
|
|---|
| 378 | extern "C" {
|
|---|
| 379 | #endif
|
|---|
| 380 |
|
|---|
| 381 | /* --- Unicode Type ------------------------------------------------------- */
|
|---|
| 382 |
|
|---|
| 383 | typedef struct { /* Object structure behind PyUnicode_Type instances */
|
|---|
| 384 | PyObject_HEAD
|
|---|
| 385 | Py_ssize_t length; /* Length of the raw Unicode data in the buffer, counted in Py_UNICODE elements (not bytes) */
|
|---|
| 386 | Py_UNICODE *str; /* Pointer to the raw Py_UNICODE buffer */
|
|---|
| 387 | long hash; /* Cached hash value; -1 if not set */
|
|---|
| 388 | PyObject *defenc; /* (Default) encoded version as a Python
|
|---|
| 389 | string, or NULL; this is used for
|
|---|
| 390 | implementing the buffer protocol */
|
|---|
| 391 | } PyUnicodeObject;
|
|---|
| 392 |
|
|---|
| 393 | PyAPI_DATA(PyTypeObject) PyUnicode_Type;
|
|---|
| 394 |
|
|---|
| 395 | #define PyUnicode_Check(op) PyObject_TypeCheck(op, &PyUnicode_Type)
|
|---|
| 396 | #define PyUnicode_CheckExact(op) ((op)->ob_type == &PyUnicode_Type)
|
|---|
| 397 |
|
|---|
| 398 | /* Fast access macros: op is cast to PyUnicodeObject* without any type check. GET_SIZE yields the length in Py_UNICODE elements, GET_DATA_SIZE the same length multiplied by sizeof(Py_UNICODE); AS_UNICODE and AS_DATA yield the internal buffer as Py_UNICODE* and const char* respectively. */
|
|---|
| 399 | #define PyUnicode_GET_SIZE(op) \
|
|---|
| 400 | (((PyUnicodeObject *)(op))->length)
|
|---|
| 401 | #define PyUnicode_GET_DATA_SIZE(op) \
|
|---|
| 402 | (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
|
|---|
| 403 | #define PyUnicode_AS_UNICODE(op) \
|
|---|
| 404 | (((PyUnicodeObject *)(op))->str)
|
|---|
| 405 | #define PyUnicode_AS_DATA(op) \
|
|---|
| 406 | ((const char *)((PyUnicodeObject *)(op))->str)
|
|---|
| 407 |
|
|---|
| 408 | /* --- Constants ---------------------------------------------------------- */
|
|---|
| 409 |
|
|---|
| 410 | /* This Unicode character will be used as replacement character during
|
|---|
| 411 | decoding if the errors argument is set to "replace". Note: the
|
|---|
| 412 | Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
|
|---|
| 413 | Unicode 3.0. */
|
|---|
| 414 |
|
|---|
| 415 | #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
|
|---|
| 416 |
|
|---|
| 417 | /* === Public API ========================================================= */
|
|---|
| 418 |
|
|---|
| 419 | /* --- Plain Py_UNICODE --------------------------------------------------- */
|
|---|
| 420 |
|
|---|
| 421 | /* Create a Unicode Object from the Py_UNICODE buffer u of the given
|
|---|
| 422 | size.
|
|---|
| 423 |
|
|---|
| 424 | u may be NULL which causes the contents to be undefined. It is the
|
|---|
| 425 | user's responsibility to fill in the needed data afterwards. Note
|
|---|
| 426 | that modifying the Unicode object contents after construction is
|
|---|
| 427 | only allowed if u was set to NULL.
|
|---|
| 428 |
|
|---|
| 429 | The buffer is copied into the new object. */
|
|---|
| 430 |
|
|---|
| 431 | PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
|
|---|
| 432 | const Py_UNICODE *u, /* Unicode buffer */
|
|---|
| 433 | Py_ssize_t size /* size of buffer */
|
|---|
| 434 | );
|
|---|
| 435 |
|
|---|
| 436 | /* Return a read-only pointer to the Unicode object's internal
|
|---|
| 437 | Py_UNICODE buffer. */
|
|---|
| 438 |
|
|---|
| 439 | PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
|
|---|
| 440 | PyObject *unicode /* Unicode object */
|
|---|
| 441 | );
|
|---|
| 442 |
|
|---|
| 443 | /* Get the length of the Unicode object. */
|
|---|
| 444 |
|
|---|
| 445 | PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
|
|---|
| 446 | PyObject *unicode /* Unicode object */
|
|---|
| 447 | );
|
|---|
| 448 |
|
|---|
| 449 | /* Get the maximum ordinal for a Unicode character. */
|
|---|
| 450 | PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
|
|---|
| 451 |
|
|---|
| 452 | /* Resize an already allocated Unicode object to the new size length.
|
|---|
| 453 |
|
|---|
| 454 | *unicode is modified to point to the new (resized) object and 0
|
|---|
| 455 | is returned on success.
|
|---|
| 456 |
|
|---|
| 457 | This API may only be called by the function which also called the
|
|---|
| 458 | Unicode constructor. The refcount on the object must be 1. Otherwise,
|
|---|
| 459 | an error is returned.
|
|---|
| 460 |
|
|---|
| 461 | Error handling is implemented as follows: an exception is set, -1
|
|---|
| 462 | is returned and *unicode left untouched.
|
|---|
| 463 |
|
|---|
| 464 | */
|
|---|
| 465 |
|
|---|
| 466 | PyAPI_FUNC(int) PyUnicode_Resize(
|
|---|
| 467 | PyObject **unicode, /* Pointer to the Unicode object */
|
|---|
| 468 | Py_ssize_t length /* New length */
|
|---|
| 469 | );
|
|---|
| 470 |
|
|---|
| 471 | /* Coerce obj to a Unicode object and return a reference with
|
|---|
| 472 | *incremented* refcount.
|
|---|
| 473 |
|
|---|
| 474 | Coercion is done in the following way:
|
|---|
| 475 |
|
|---|
| 476 | 1. String and other char buffer compatible objects are decoded
|
|---|
| 477 | under the assumptions that they contain data using the current
|
|---|
| 478 | default encoding. Decoding is done in "strict" mode.
|
|---|
| 479 |
|
|---|
| 480 | 2. All other objects (including Unicode objects) raise an
|
|---|
| 481 | exception.
|
|---|
| 482 |
|
|---|
| 483 | The API returns NULL in case of an error. The caller is responsible
|
|---|
| 484 | for decref'ing the returned objects.
|
|---|
| 485 |
|
|---|
| 486 | */
|
|---|
| 487 |
|
|---|
| 488 | PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
|
|---|
| 489 | register PyObject *obj, /* Object */
|
|---|
| 490 | const char *encoding, /* encoding */
|
|---|
| 491 | const char *errors /* error handling */
|
|---|
| 492 | );
|
|---|
| 493 |
|
|---|
| 494 | /* Coerce obj to a Unicode object and return a reference with
|
|---|
| 495 | *incremented* refcount.
|
|---|
| 496 |
|
|---|
| 497 | Unicode objects are passed back as-is (subclasses are converted to
|
|---|
| 498 | true Unicode objects), all other objects are delegated to
|
|---|
| 499 | PyUnicode_FromEncodedObject(obj, NULL, "strict") which results in
|
|---|
| 500 | using the default encoding as basis for decoding the object.
|
|---|
| 501 |
|
|---|
| 502 | The API returns NULL in case of an error. The caller is responsible
|
|---|
| 503 | for decref'ing the returned objects.
|
|---|
| 504 |
|
|---|
| 505 | */
|
|---|
| 506 |
|
|---|
| 507 | PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
|
|---|
| 508 | register PyObject *obj /* Object */
|
|---|
| 509 | );
|
|---|
| 510 |
|
|---|
| 511 | /* --- wchar_t support for platforms which support it --------------------- */
|
|---|
| 512 |
|
|---|
| 513 | #ifdef HAVE_WCHAR_H
|
|---|
| 514 |
|
|---|
| 515 | /* Create a Unicode Object from the wchar_t buffer w of the given
|
|---|
| 516 | size.
|
|---|
| 517 |
|
|---|
| 518 | The buffer is copied into the new object. */
|
|---|
| 519 |
|
|---|
| 520 | PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
|
|---|
| 521 | register const wchar_t *w, /* wchar_t buffer */
|
|---|
| 522 | Py_ssize_t size /* size of buffer */
|
|---|
| 523 | );
|
|---|
| 524 |
|
|---|
| 525 | /* Copies the Unicode Object contents into the wchar_t buffer w. At
|
|---|
| 526 | most size wchar_t characters are copied.
|
|---|
| 527 |
|
|---|
| 528 | Note that the resulting wchar_t string may or may not be
|
|---|
| 529 | 0-terminated. It is the responsibility of the caller to make sure
|
|---|
| 530 | that the wchar_t string is 0-terminated in case this is required by
|
|---|
| 531 | the application.
|
|---|
| 532 |
|
|---|
| 533 | Returns the number of wchar_t characters copied (excluding a
|
|---|
| 534 | possibly trailing 0-termination character) or -1 in case of an
|
|---|
| 535 | error. */
|
|---|
| 536 |
|
|---|
| 537 | PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
|
|---|
| 538 | PyUnicodeObject *unicode, /* Unicode object */
|
|---|
| 539 | register wchar_t *w, /* wchar_t buffer */
|
|---|
| 540 | Py_ssize_t size /* size of buffer */
|
|---|
| 541 | );
|
|---|
| 542 |
|
|---|
| 543 | #endif
|
|---|
| 544 |
|
|---|
| 545 | /* --- Unicode ordinals --------------------------------------------------- */
|
|---|
| 546 |
|
|---|
| 547 | /* Create a Unicode Object from the given Unicode code point ordinal.
|
|---|
| 548 |
|
|---|
| 549 | The ordinal must be in range(0x10000) on narrow Python builds
|
|---|
| 550 | (UCS2), and range(0x110000) on wide builds (UCS4). A ValueError is
|
|---|
| 551 | raised in case it is not.
|
|---|
| 552 |
|
|---|
| 553 | */
|
|---|
| 554 |
|
|---|
| 555 | PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
|
|---|
| 556 |
|
|---|
| 557 | /* === Builtin Codecs =====================================================
|
|---|
| 558 |
|
|---|
| 559 | Many of these APIs take two arguments encoding and errors. These
|
|---|
| 560 | parameters encoding and errors have the same semantics as the ones
|
|---|
| 561 | of the builtin unicode() API.
|
|---|
| 562 |
|
|---|
| 563 | Setting encoding to NULL causes the default encoding to be used.
|
|---|
| 564 |
|
|---|
| 565 | Error handling is set by errors which may also be set to NULL
|
|---|
| 566 | meaning to use the default handling defined for the codec. Default
|
|---|
| 567 | error handling for all builtin codecs is "strict" (ValueErrors are
|
|---|
| 568 | raised).
|
|---|
| 569 |
|
|---|
| 570 | The codecs all use a similar interface. Only deviations from the
|
|---|
| 571 | generic interface are documented.
|
|---|
| 572 |
|
|---|
| 573 | */
|
|---|
| 574 |
|
|---|
| 575 | /* --- Manage the default encoding ---------------------------------------- */
|
|---|
| 576 |
|
|---|
| 577 | /* Return a Python string holding the default encoded value of the
|
|---|
| 578 | Unicode object.
|
|---|
| 579 |
|
|---|
| 580 | The resulting string is cached in the Unicode object for subsequent
|
|---|
|
|---|