1 #ifndef Py_UNICODEOBJECT_H 2 #define Py_UNICODEOBJECT_H 3 4 #include <stdarg.h> // va_list 5 6 /* 7 8 Unicode implementation based on original code by Fredrik Lundh, 9 modified by Marc-Andre Lemburg ([email protected]) according to the 10 Unicode Integration Proposal. (See 11 http://www.egenix.com/files/python/unicode-proposal.txt). 12 13 Copyright (c) Corporation for National Research Initiatives. 14 15 16 Original header: 17 -------------------------------------------------------------------- 18 19 * Yet another Unicode string type for Python. This type supports the 20 * 16-bit Basic Multilingual Plane (BMP) only. 21 * 22 * Written by Fredrik Lundh, January 1999. 23 * 24 * Copyright (c) 1999 by Secret Labs AB. 25 * Copyright (c) 1999 by Fredrik Lundh. 26 * 27 * [email protected] 28 * http://www.pythonware.com 29 * 30 * -------------------------------------------------------------------- 31 * This Unicode String Type is 32 * 33 * Copyright (c) 1999 by Secret Labs AB 34 * Copyright (c) 1999 by Fredrik Lundh 35 * 36 * By obtaining, using, and/or copying this software and/or its 37 * associated documentation, you agree that you have read, understood, 38 * and will comply with the following terms and conditions: 39 * 40 * Permission to use, copy, modify, and distribute this software and its 41 * associated documentation for any purpose and without fee is hereby 42 * granted, provided that the above copyright notice appears in all 43 * copies, and that both that copyright notice and this permission notice 44 * appear in supporting documentation, and that the name of Secret Labs 45 * AB or the author not be used in advertising or publicity pertaining to 46 * distribution of the software without specific, written prior 47 * permission. 48 * 49 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO 50 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 51 * FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR 52 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 53 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 54 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 55 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 56 * -------------------------------------------------------------------- */ 57 58 #include <ctype.h> 59 60 /* === Internal API ======================================================= */ 61 62 /* --- Internal Unicode Format -------------------------------------------- */ 63 64 /* Python 3.x requires unicode */ 65 #define Py_USING_UNICODE 66 67 #ifndef SIZEOF_WCHAR_T 68 #error Must define SIZEOF_WCHAR_T 69 #endif 70 71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T 72 73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE. 74 Otherwise, Unicode strings are stored as UCS-2 (with limited support 75 for UTF-16) */ 76 77 #if Py_UNICODE_SIZE >= 4 78 #define Py_UNICODE_WIDE 79 #endif 80 81 /* Set these flags if the platform has "wchar.h" and the 82 wchar_t type is a 16-bit unsigned type */ 83 /* #define HAVE_WCHAR_H */ 84 /* #define HAVE_USABLE_WCHAR_T */ 85 86 /* If the compiler provides a wchar_t type we try to support it 87 through the interface functions PyUnicode_FromWideChar(), 88 PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */ 89 90 #ifdef HAVE_USABLE_WCHAR_T 91 # ifndef HAVE_WCHAR_H 92 # define HAVE_WCHAR_H 93 # endif 94 #endif 95 96 #ifdef HAVE_WCHAR_H 97 # include <wchar.h> 98 #endif 99 100 /* Py_UCS4 and Py_UCS2 are typedefs for the respective 101 unicode representations. 
*/ 102 typedef uint32_t Py_UCS4; 103 typedef uint16_t Py_UCS2; 104 typedef uint8_t Py_UCS1; 105 106 #ifdef __cplusplus 107 extern "C" { 108 #endif 109 110 111 PyAPI_DATA(PyTypeObject) PyUnicode_Type; 112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type; 113 114 #define PyUnicode_Check(op) \ 115 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS) 116 #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type) 117 118 /* --- Constants ---------------------------------------------------------- */ 119 120 /* This Unicode character will be used as replacement character during 121 decoding if the errors argument is set to "replace". Note: the 122 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in 123 Unicode 3.0. */ 124 125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD) 126 127 /* === Public API ========================================================= */ 128 129 /* Similar to PyUnicode_FromUnicode(), but u points to UTF-8 encoded bytes */ 130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize( 131 const char *u, /* UTF-8 encoded string */ 132 Py_ssize_t size /* size of buffer */ 133 ); 134 135 /* Similar to PyUnicode_FromUnicode(), but u points to null-terminated 136 UTF-8 encoded bytes. The size is determined with strlen(). */ 137 PyAPI_FUNC(PyObject*) PyUnicode_FromString( 138 const char *u /* UTF-8 encoded string */ 139 ); 140 141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 142 PyAPI_FUNC(PyObject*) PyUnicode_Substring( 143 PyObject *str, 144 Py_ssize_t start, 145 Py_ssize_t end); 146 #endif 147 148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 149 /* Copy the string into a UCS4 buffer including the null character if copy_null 150 is set. Return NULL and raise an exception on error. Raise a SystemError if 151 the buffer is smaller than the string. Return buffer on success. 152 153 buflen is the length of the buffer in (Py_UCS4) characters. 
*/ 154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4( 155 PyObject *unicode, 156 Py_UCS4* buffer, 157 Py_ssize_t buflen, 158 int copy_null); 159 160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using 161 PyMem_Malloc; if this fails, NULL is returned with a memory error 162 exception set. NOTE(review): presumably the caller releases the buffer with PyMem_Free() - confirm. */ 163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode); 164 #endif 165 166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 167 /* Get the length of the Unicode object. */ 168 169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength( 170 PyObject *unicode 171 ); 172 #endif 173 174 /* Get the number of Py_UNICODE units in the 175 string representation. */ 176 177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize( 178 PyObject *unicode /* Unicode object */ 179 ); 180 181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 182 /* Read a character from the string. */ 183 184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar( 185 PyObject *unicode, 186 Py_ssize_t index 187 ); 188 189 /* Write a character to the string. The string must have been created through 190 PyUnicode_New, must not be shared, and must not have been hashed yet. 191 192 Return 0 on success, -1 on error. */ 193 194 PyAPI_FUNC(int) PyUnicode_WriteChar( 195 PyObject *unicode, 196 Py_ssize_t index, 197 Py_UCS4 character 198 ); 199 #endif 200 201 /* Resize a Unicode object. The length is the number of characters, except 202 if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length 203 is the number of Py_UNICODE characters. 204 205 *unicode is modified to point to the new (resized) object and 0 206 returned on success. 207 208 Try to resize the string in place (which is usually faster than allocating 209 a new string and copy characters), or create a new string. 210 211 Error handling is implemented as follows: an exception is set, -1 212 is returned and *unicode left untouched. 
213 214 WARNING: The function doesn't check string content, the result may not be a 215 string in canonical representation. */ 216 217 PyAPI_FUNC(int) PyUnicode_Resize( 218 PyObject **unicode, /* Pointer to the Unicode object */ 219 Py_ssize_t length /* New length */ 220 ); 221 222 /* Decode obj to a Unicode object. 223 224 bytes, bytearray and other bytes-like objects are decoded according to the 225 given encoding and error handler. The encoding and error handler can be 226 NULL to have the interface use UTF-8 and "strict". 227 228 All other objects (including Unicode objects) raise an exception. 229 230 The API returns NULL in case of an error. The caller is responsible 231 for decref'ing the returned objects. 232 233 */ 234 235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject( 236 PyObject *obj, /* Object */ 237 const char *encoding, /* encoding */ 238 const char *errors /* error handling */ 239 ); 240 241 /* Copy an instance of a Unicode subtype to a new true Unicode object if 242 necessary. If obj is already a true Unicode object (not a subtype), return 243 the reference with *incremented* refcount. 244 245 The API returns NULL in case of an error. The caller is responsible 246 for decref'ing the returned objects. 247 248 */ 249 250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject( 251 PyObject *obj /* Object */ 252 ); 253 254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( 255 const char *format, /* ASCII-encoded string */ 256 va_list vargs 257 ); 258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( 259 const char *format, /* ASCII-encoded string */ 260 ... 261 ); 262 263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **); 264 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString( 265 const char *u /* UTF-8 encoded string */ 266 ); 267 268 // PyUnicode_InternImmortal() is deprecated since Python 3.10 269 // and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead. 
270 Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **); 271 272 /* --- wchar_t support for platforms which support it --------------------- */ 273 274 #ifdef HAVE_WCHAR_H 275 276 /* Create a Unicode Object from the wchar_t buffer w of the given 277 size. 278 279 The buffer is copied into the new object. */ 280 281 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar( 282 const wchar_t *w, /* wchar_t buffer */ 283 Py_ssize_t size /* size of buffer */ 284 ); 285 286 /* Copies the Unicode Object contents into the wchar_t buffer w. At 287 most size wchar_t characters are copied. 288 289 Note that the resulting wchar_t string may or may not be 290 0-terminated. It is the responsibility of the caller to make sure 291 that the wchar_t string is 0-terminated in case this is required by 292 the application. 293 294 Returns the number of wchar_t characters copied (excluding a 295 possibly trailing 0-termination character) or -1 in case of an 296 error. */ 297 298 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar( 299 PyObject *unicode, /* Unicode object */ 300 wchar_t *w, /* wchar_t buffer */ 301 Py_ssize_t size /* size of buffer */ 302 ); 303 304 /* Convert the Unicode object to a wide character string. The output string 305 always ends with a nul character. If size is not NULL, write the number of 306 wide characters (excluding the null character) into *size. 307 308 Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it) 309 on success. On error, returns NULL, *size is undefined and raises a 310 MemoryError. */ 311 312 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString( 313 PyObject *unicode, /* Unicode object */ 314 Py_ssize_t *size /* number of characters of the result */ 315 ); 316 317 #endif 318 319 /* --- Unicode ordinals --------------------------------------------------- */ 320 321 /* Create a Unicode Object from the given Unicode code point ordinal. 322 323 The ordinal must be in range(0x110000). 
A ValueError is 324 raised in case it is not. 325 326 */ 327 328 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal); 329 330 /* === Builtin Codecs ===================================================== 331 332 Many of these APIs take two arguments encoding and errors. These 333 parameters encoding and errors have the same semantics as the ones 334 of the builtin str() API. 335 336 Setting encoding to NULL causes the default encoding (UTF-8) to be used. 337 338 Error handling is set by errors which may also be set to NULL 339 meaning to use the default handling defined for the codec. Default 340 error handling for all builtin codecs is "strict" (ValueErrors are 341 raised). 342 343 The codecs all use a similar interface. Only deviations from the 344 generic ones are documented. 345 346 */ 347 348 /* --- Manage the default encoding ---------------------------------------- */ 349 350 /* Returns "utf-8". */ 351 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void); 352 353 /* --- Generic Codecs ----------------------------------------------------- */ 354 355 /* Create a Unicode object by decoding the encoded string s of the 356 given size. */ 357 358 PyAPI_FUNC(PyObject*) PyUnicode_Decode( 359 const char *s, /* encoded string */ 360 Py_ssize_t size, /* size of buffer */ 361 const char *encoding, /* encoding */ 362 const char *errors /* error handling */ 363 ); 364 365 /* Decode a Unicode object unicode and return the result as Python 366 object. 367 368 This API is DEPRECATED. The only supported standard encoding is rot13. 369 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 370 that decode from str. */ 371 372 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( 373 PyObject *unicode, /* Unicode object */ 374 const char *encoding, /* encoding */ 375 const char *errors /* error handling */ 376 ); 377 378 /* Decode a Unicode object unicode and return the result as Unicode 379 object. 380 381 This API is DEPRECATED. 
The only supported standard encoding is rot13. 382 Use PyCodec_Decode() to decode with rot13 and non-standard codecs 383 that decode from str to str. */ 384 385 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( 386 PyObject *unicode, /* Unicode object */ 387 const char *encoding, /* encoding */ 388 const char *errors /* error handling */ 389 ); 390 391 /* Encodes a Unicode object and returns the result as Python 392 object. 393 394 This API is DEPRECATED. It is superseded by PyUnicode_AsEncodedString() 395 since all standard encodings (except rot13) encode str to bytes. 396 Use PyCodec_Encode() for encoding with rot13 and non-standard codecs 397 that encode from str to non-bytes. */ 398 399 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject( 400 PyObject *unicode, /* Unicode object */ 401 const char *encoding, /* encoding */ 402 const char *errors /* error handling */ 403 ); 404 405 /* Encodes a Unicode object and returns the result as Python string 406 object. */ 407 408 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( 409 PyObject *unicode, /* Unicode object */ 410 const char *encoding, /* encoding */ 411 const char *errors /* error handling */ 412 ); 413 414 /* Encodes a Unicode object and returns the result as Unicode 415 object. 416 417 This API is DEPRECATED. The only supported standard encoding is rot13. 418 Use PyCodec_Encode() to encode with rot13 and non-standard codecs 419 that encode from str to str. */ 420 421 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( 422 PyObject *unicode, /* Unicode object */ 423 const char *encoding, /* encoding */ 424 const char *errors /* error handling */ 425 ); 426 427 /* Build an encoding map. 
*/ 428 429 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( 430 PyObject* string /* 256 character map */ 431 ); 432 433 /* --- UTF-7 Codecs ------------------------------------------------------- */ 434 435 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( 436 const char *string, /* UTF-7 encoded string */ 437 Py_ssize_t length, /* size of string */ 438 const char *errors /* error handling */ 439 ); 440 441 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful( 442 const char *string, /* UTF-7 encoded string */ 443 Py_ssize_t length, /* size of string */ 444 const char *errors, /* error handling */ 445 Py_ssize_t *consumed /* bytes consumed */ 446 ); 447 448 /* --- UTF-8 Codecs ------------------------------------------------------- */ 449 450 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8( 451 const char *string, /* UTF-8 encoded string */ 452 Py_ssize_t length, /* size of string */ 453 const char *errors /* error handling */ 454 ); 455 456 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful( 457 const char *string, /* UTF-8 encoded string */ 458 Py_ssize_t length, /* size of string */ 459 const char *errors, /* error handling */ 460 Py_ssize_t *consumed /* bytes consumed */ 461 ); 462 463 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String( 464 PyObject *unicode /* Unicode object */ 465 ); 466 467 /* Returns a pointer to the default encoding (UTF-8) of the 468 Unicode object unicode and the size of the encoded representation 469 in bytes stored in *size. 470 471 In case of an error, no *size is set. 472 473 This function caches the UTF-8 encoded string in the unicodeobject 474 and subsequent calls will return the same string. The memory is released 475 when the unicodeobject is deallocated. 
476 */ 477 478 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000 479 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize( 480 PyObject *unicode, 481 Py_ssize_t *size); 482 #endif 483 484 /* --- UTF-32 Codecs ------------------------------------------------------ */ 485 486 /* Decodes length bytes from a UTF-32 encoded buffer string and returns 487 the corresponding Unicode object. 488 489 errors (if non-NULL) defines the error handling. It defaults 490 to "strict". 491 492 If byteorder is non-NULL, the decoder starts decoding using the 493 given byte order: 494 495 *byteorder == -1: little endian 496 *byteorder == 0: native order 497 *byteorder == 1: big endian 498 499 In native mode, the first four bytes of the stream are checked for a 500 BOM mark. If found, the BOM mark is analysed, the byte order 501 adjusted and the BOM skipped. In the other modes, no BOM mark 502 interpretation is done. After completion, *byteorder is set to the 503 current byte order at the end of input data. 504 505 If byteorder is NULL, the codec starts in native order mode. 506 507 */ 508 509 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32( 510 const char *string, /* UTF-32 encoded string */ 511 Py_ssize_t length, /* size of string */ 512 const char *errors, /* error handling */ 513 int *byteorder /* pointer to byteorder to use 514 0=native;-1=LE,1=BE; updated on 515 exit */ 516 ); 517 518 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful( 519 const char *string, /* UTF-32 encoded string */ 520 Py_ssize_t length, /* size of string */ 521 const char *errors, /* error handling */ 522 int *byteorder, /* pointer to byteorder to use 523 0=native;-1=LE,1=BE; updated on 524 exit */ 525 Py_ssize_t *consumed /* bytes consumed */ 526 ); 527 528 /* Returns a Python string using the UTF-32 encoding in native byte 529 order. The string always starts with a BOM mark. 
*/ 530 531 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String( 532 PyObject *unicode /* Unicode object */ 533 ); 534 535 /* NOTE(review): no declaration follows this comment in this header; it describes a UTF-32 encoder that takes a byteorder argument, presumably declared elsewhere - confirm, then move or drop this text. Returns a Python string object holding the UTF-32 encoded value of 536 the Unicode data. 537 538 If byteorder is not 0, output is written according to the following 539 byte order: 540 541 byteorder == -1: little endian 542 byteorder == 0: native byte order (writes a BOM mark) 543 byteorder == 1: big endian 544 545 If byteorder is 0, the output string will always start with the 546 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is 547 prepended. 548 549 */ 550 551 /* --- UTF-16 Codecs ------------------------------------------------------ */ 552 553 /* Decodes length bytes from a UTF-16 encoded buffer string and returns 554 the corresponding Unicode object. 555 556 errors (if non-NULL) defines the error handling. It defaults 557 to "strict". 558 559 If byteorder is non-NULL, the decoder starts decoding using the 560 given byte order: 561 562 *byteorder == -1: little endian 563 *byteorder == 0: native order 564 *byteorder == 1: big endian 565 566 In native mode, the first two bytes of the stream are checked for a 567 BOM mark. If found, the BOM mark is analysed, the byte order 568 adjusted and the BOM skipped. In the other modes, no BOM mark 569 interpretation is done. After completion, *byteorder is set to the 570 current byte order at the end of input data. 571 572 If byteorder is NULL, the codec starts in native order mode. 
573 574 */ 575 576 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16( 577 const char *string, /* UTF-16 encoded string */ 578 Py_ssize_t length, /* size of string */ 579 const char *errors, /* error handling */ 580 int *byteorder /* pointer to byteorder to use 581 0=native;-1=LE,1=BE; updated on 582 exit */ 583 ); 584 585 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful( 586 const char *string, /* UTF-16 encoded string */ 587 Py_ssize_t length, /* size of string */ 588 const char *errors, /* error handling */ 589 int *byteorder, /* pointer to byteorder to use 590 0=native;-1=LE,1=BE; updated on 591 exit */ 592 Py_ssize_t *consumed /* bytes consumed */ 593 ); 594 595 /* Returns a Python string using the UTF-16 encoding in native byte 596 order. The string always starts with a BOM mark. */ 597 598 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String( 599 PyObject *unicode /* Unicode object */ 600 ); 601 602 /* --- Unicode-Escape Codecs ---------------------------------------------- */ 603 604 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape( 605 const char *string, /* Unicode-Escape encoded string */ 606 Py_ssize_t length, /* size of string */ 607 const char *errors /* error handling */ 608 ); 609 610 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( 611 PyObject *unicode /* Unicode object */ 612 ); 613 614 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */ 615 616 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape( 617 const char *string, /* Raw-Unicode-Escape encoded string */ 618 Py_ssize_t length, /* size of string */ 619 const char *errors /* error handling */ 620 ); 621 622 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString( 623 PyObject *unicode /* Unicode object */ 624 ); 625 626 /* --- Latin-1 Codecs ----------------------------------------------------- 627 628 Note: Latin-1 corresponds to the first 256 Unicode ordinals. 
*/ 629 630 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1( 631 const char *string, /* Latin-1 encoded string */ 632 Py_ssize_t length, /* size of string */ 633 const char *errors /* error handling */ 634 ); 635 636 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String( 637 PyObject *unicode /* Unicode object */ 638 ); 639 640 /* --- ASCII Codecs ------------------------------------------------------- 641 642 Only 7-bit ASCII data is accepted. All other codes generate errors. 643 644 */ 645 646 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII( 647 const char *string, /* ASCII encoded string */ 648 Py_ssize_t length, /* size of string */ 649 const char *errors /* error handling */ 650 ); 651 652 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString( 653 PyObject *unicode /* Unicode object */ 654 ); 655 656 /* --- Character Map Codecs ----------------------------------------------- 657 658 This codec uses mappings to encode and decode characters. 659 660 Decoding mappings must map byte ordinals (integers in the range from 0 to 661 255) to Unicode strings, integers (which are then interpreted as Unicode 662 ordinals) or None. Unmapped data bytes (ones which cause a LookupError) 663 as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined 664 mapping" and cause an error. 665 666 Encoding mappings must map Unicode ordinal integers to bytes objects, 667 integers in the range from 0 to 255 or None. Unmapped character 668 ordinals (ones which cause a LookupError) as well as mapped to 669 None are treated as "undefined mapping" and cause an error. 
670 671 */ 672 673 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap( 674 const char *string, /* Encoded string */ 675 Py_ssize_t length, /* size of string */ 676 PyObject *mapping, /* decoding mapping */ 677 const char *errors /* error handling */ 678 ); 679 680 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString( 681 PyObject *unicode, /* Unicode object */ 682 PyObject *mapping /* encoding mapping */ 683 ); 684 685 /* --- MBCS codecs for Windows -------------------------------------------- */ 686 687 #ifdef MS_WINDOWS 688 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS( 689 const char *string, /* MBCS encoded string */ 690 Py_ssize_t length, /* size of string */ 691 const char *errors /* error handling */ 692 ); 693 694 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( 695 const char *string, /* MBCS encoded string */ 696 Py_ssize_t length, /* size of string */ 697 const char *errors, /* error handling */ 698 Py_ssize_t *consumed /* bytes consumed */ 699 ); 700 701 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 702 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful( 703 int code_page, /* code page number */ 704 const char *string, /* encoded string */ 705 Py_ssize_t length, /* size of string */ 706 const char *errors, /* error handling */ 707 Py_ssize_t *consumed /* bytes consumed */ 708 ); 709 #endif 710 711 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( 712 PyObject *unicode /* Unicode object */ 713 ); 714 715 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 716 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage( 717 int code_page, /* code page number */ 718 PyObject *unicode, /* Unicode object */ 719 const char *errors /* error handling */ 720 ); 721 #endif 722 723 #endif /* MS_WINDOWS */ 724 725 /* --- Locale encoding --------------------------------------------------- */ 726 727 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 728 /* Decode a string from the current locale encoding. 
The decoder is strict if 729 *surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape' 730 error handler (PEP 383) to escape undecodable bytes. If a byte sequence can 731 be decoded as a surrogate character and *surrogateescape* is not equal to 732 zero, the byte sequence is escaped using the 'surrogateescape' error handler 733 instead of being decoded. *str* must end with a null character but cannot 734 contain embedded null characters. */ 735 736 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize( 737 const char *str, 738 Py_ssize_t len, 739 const char *errors); 740 741 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string 742 length using strlen(). */ 743 744 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale( 745 const char *str, 746 const char *errors); 747 748 /* Encode a Unicode object to the current locale encoding. The encoder is 749 strict if *surrogateescape* is equal to zero, otherwise the 750 "surrogateescape" error handler is used. Return a bytes object. The string 751 cannot contain embedded null characters. NOTE(review): the prototype below takes an errors string, not a surrogateescape flag; this wording appears to predate that signature - confirm the accepted errors values. */ 752 753 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale( 754 PyObject *unicode, 755 const char *errors 756 ); 757 #endif 758 759 /* --- File system encoding ---------------------------------------------- */ 760 761 /* ParseTuple converter: encode str objects to bytes using 762 PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */ 763 764 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*); 765 766 /* ParseTuple converter: decode bytes objects to unicode using 767 PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */ 768 769 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*); 770 771 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding 772 and the "surrogateescape" error handler. 773 774 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 775 encoding. 776 777 Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known. 
778 */ 779 780 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault( 781 const char *s /* encoded string */ 782 ); 783 784 /* Decode a string using Py_FileSystemDefaultEncoding 785 and the "surrogateescape" error handler. 786 787 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 788 encoding. 789 */ 790 791 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize( 792 const char *s, /* encoded string */ 793 Py_ssize_t size /* size */ 794 ); 795 796 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the 797 "surrogateescape" error handler, and return bytes. 798 799 If Py_FileSystemDefaultEncoding is not set, fall back to the locale 800 encoding. 801 */ 802 803 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault( 804 PyObject *unicode 805 ); 806 807 /* --- Methods & Slots ---------------------------------------------------- 808 809 These are capable of handling Unicode objects and strings on input 810 (we refer to them as strings in the descriptions) and return 811 Unicode objects or integers as appropriate. */ 812 813 /* Concat two strings giving a new Unicode string. */ 814 815 PyAPI_FUNC(PyObject*) PyUnicode_Concat( 816 PyObject *left, /* Left string */ 817 PyObject *right /* Right string */ 818 ); 819 820 /* Concat two strings and put the result in *pleft 821 (sets *pleft to NULL on error) */ 822 823 PyAPI_FUNC(void) PyUnicode_Append( 824 PyObject **pleft, /* Pointer to left string */ 825 PyObject *right /* Right string */ 826 ); 827 828 /* Concat two strings, put the result in *pleft and drop the right object 829 (sets *pleft to NULL on error) */ 830 831 PyAPI_FUNC(void) PyUnicode_AppendAndDel( 832 PyObject **pleft, /* Pointer to left string */ 833 PyObject *right /* Right string */ 834 ); 835 836 /* Split a string giving a list of Unicode strings. 837 838 If sep is NULL, splitting will be done at all whitespace 839 substrings. Otherwise, splits occur at the given separator. 840 841 At most maxsplit splits will be done. 
If negative, no limit is set. 842 843 Separators are not included in the resulting list. 844 845 */ 846 847 PyAPI_FUNC(PyObject*) PyUnicode_Split( 848 PyObject *s, /* String to split */ 849 PyObject *sep, /* String separator */ 850 Py_ssize_t maxsplit /* Maxsplit count */ 851 ); 852 853 /* Like PyUnicode_Split(), but split the string at line breaks. 854 855 CRLF is considered to be one line break. Line breaks are not 856 included in the resulting list unless keepends is true. */ 857 858 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( 859 PyObject *s, /* String to split */ 860 int keepends /* If true, line end markers are included */ 861 ); 862 863 /* Partition a string using a given separator. */ 864 865 PyAPI_FUNC(PyObject*) PyUnicode_Partition( 866 PyObject *s, /* String to partition */ 867 PyObject *sep /* String separator */ 868 ); 869 870 /* Partition a string using a given separator, searching from the end of the 871 string. */ 872 873 PyAPI_FUNC(PyObject*) PyUnicode_RPartition( 874 PyObject *s, /* String to partition */ 875 PyObject *sep /* String separator */ 876 ); 877 878 /* Split a string giving a list of Unicode strings. 879 880 If sep is NULL, splitting will be done at all whitespace 881 substrings. Otherwise, splits occur at the given separator. 882 883 At most maxsplit splits will be done; if maxsplit is negative, 884 no limit is set. Unlike PyUnicode_Split(), PyUnicode_RSplit() 885 splits from the end of the string. 886 887 Separators are not included in the resulting list. 888 889 */ 890 891 PyAPI_FUNC(PyObject*) PyUnicode_RSplit( 892 PyObject *s, /* String to split */ 893 PyObject *sep, /* String separator */ 894 Py_ssize_t maxsplit /* Maxsplit count */ 895 ); 896 897 /* Translate a string by applying a character mapping table to it and 898 return the resulting Unicode object. 899 900 The mapping table must map Unicode ordinal integers to Unicode strings, 901 Unicode ordinal integers or None (causing deletion of the character). 902 903 Mapping tables may be dictionaries or sequences. 
Unmapped character 904 ordinals (ones which cause a LookupError) are left untouched and 905 are copied as-is. 906 907 */ 908 909 PyAPI_FUNC(PyObject *) PyUnicode_Translate( 910 PyObject *str, /* String */ 911 PyObject *table, /* Translate table */ 912 const char *errors /* error handling */ 913 ); 914 915 /* Join a sequence of strings using the given separator and return 916 the resulting Unicode string. */ 917 918 PyAPI_FUNC(PyObject*) PyUnicode_Join( 919 PyObject *separator, /* Separator string */ 920 PyObject *seq /* Sequence object */ 921 ); 922 923 /* Return 1 if substr matches str[start:end] at the given tail end, 0 924 otherwise. */ 925 926 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch( 927 PyObject *str, /* String */ 928 PyObject *substr, /* Prefix or Suffix string */ 929 Py_ssize_t start, /* Start index */ 930 Py_ssize_t end, /* Stop index */ 931 int direction /* Tail end: -1 prefix, +1 suffix */ 932 ); 933 934 /* Return the first position of substr in str[start:end] using the 935 given search direction or -1 if not found. -2 is returned in case 936 an error occurred and an exception is set. */ 937 938 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find( 939 PyObject *str, /* String */ 940 PyObject *substr, /* Substring to find */ 941 Py_ssize_t start, /* Start index */ 942 Py_ssize_t end, /* Stop index */ 943 int direction /* Find direction: +1 forward, -1 backward */ 944 ); 945 946 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000 947 /* Like PyUnicode_Find, but search for single character only. */ 948 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar( 949 PyObject *str, 950 Py_UCS4 ch, 951 Py_ssize_t start, 952 Py_ssize_t end, 953 int direction 954 ); 955 #endif 956 957 /* Count the number of occurrences of substr in str[start:end]. 
*/
958
959 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
960     PyObject *str,              /* String */
961     PyObject *substr,           /* Substring to count */
962     Py_ssize_t start,           /* Start index */
963     Py_ssize_t end              /* Stop index */
964     );
965
966 /* Replace at most maxcount occurrences of substr in str with replstr
967    and return the resulting Unicode object. */
968
969 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
970     PyObject *str,              /* String */
971     PyObject *substr,           /* Substring to find */
972     PyObject *replstr,          /* Replacement string */
973     Py_ssize_t maxcount         /* Max. number of replacements to apply;
974                                    -1 = all */
975     );
976
977 /* Compare two strings and return -1, 0, 1 for less than, equal,
978    greater than, respectively.
979    Raise an exception and return -1 on error. Because -1 is also the
       "less than" result, the return value alone does not distinguish an
       error; callers have to check whether an exception is set. */
980
981 PyAPI_FUNC(int) PyUnicode_Compare(
982     PyObject *left,             /* Left string */
983     PyObject *right             /* Right string */
984     );
985
986 /* Compare a Unicode object with a C string and return -1, 0, 1 for less than,
987    equal, and greater than, respectively. It is best to pass only
988    ASCII-encoded strings, but the function interprets the input string as
989    ISO-8859-1 if it contains non-ASCII characters.
990    This function does not raise exceptions. */
991
992 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
993     PyObject *left,             /* Unicode object */
994     const char *right           /* ASCII-encoded string */
995     );
996
997 /* Rich compare two strings and return one of the following:
998
999    - NULL in case an exception was raised
1000    - Py_True or Py_False for successful comparisons
1001    - Py_NotImplemented in case the type combination is unknown
1002
1003    Possible values for op:
1004
1005      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1006
1007 */
1008
1009 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1010     PyObject *left,             /* Left string */
1011     PyObject *right,            /* Right string */
1012     int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
1013     );
1014
1015 /* Apply an argument tuple or dictionary to a format string and return
1016    the resulting Unicode string.
*/
1017
1018 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1019     PyObject *format,           /* Format string */
1020     PyObject *args              /* Argument tuple or dictionary */
1021     );
1022
1023 /* Check whether element is contained in container and return 1 or 0
1024    accordingly.
1025
1026    element has to coerce to a one element Unicode string. -1 is
1027    returned in case of an error. */
1028
1029 PyAPI_FUNC(int) PyUnicode_Contains(
1030     PyObject *container,        /* Container string */
1031     PyObject *element           /* Element string */
1032     );
1033
1034 /* Check whether the argument is a valid identifier.
        NOTE(review): the return convention is not stated here -- confirm
        against the implementation. */
1035
1036 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1037
1038 /* === Characters Type APIs =============================================== */
1039 /* When Py_LIMITED_API is not defined, also include
        "cpython/unicodeobject.h". Py_CPYTHON_UNICODEOBJECT_H is defined only
        for the duration of that include; presumably the included header
        tests it to make sure it is reached through this file -- confirm. */
1040 #ifndef Py_LIMITED_API
1041 # define Py_CPYTHON_UNICODEOBJECT_H
1042 # include "cpython/unicodeobject.h"
1043 # undef Py_CPYTHON_UNICODEOBJECT_H
1044 #endif
1045 /* For C++ builds, close a brace opened earlier in this file
        (presumably an extern "C" block -- confirm). */
1046 #ifdef __cplusplus
1047 }
1048 #endif
1049 #endif /* !Py_UNICODEOBJECT_H */
1050