1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
3 
4 #include <stdarg.h>               // va_list
5 
6 /*
7 
8 Unicode implementation based on original code by Fredrik Lundh,
9 modified by Marc-Andre Lemburg ([email protected]) according to the
10 Unicode Integration Proposal. (See
11 http://www.egenix.com/files/python/unicode-proposal.txt).
12 
13 Copyright (c) Corporation for National Research Initiatives.
14 
15 
16  Original header:
17  --------------------------------------------------------------------
18 
19  * Yet another Unicode string type for Python.  This type supports the
20  * 16-bit Basic Multilingual Plane (BMP) only.
21  *
22  * Written by Fredrik Lundh, January 1999.
23  *
24  * Copyright (c) 1999 by Secret Labs AB.
25  * Copyright (c) 1999 by Fredrik Lundh.
26  *
27  * [email protected]
28  * http://www.pythonware.com
29  *
30  * --------------------------------------------------------------------
31  * This Unicode String Type is
32  *
33  * Copyright (c) 1999 by Secret Labs AB
34  * Copyright (c) 1999 by Fredrik Lundh
35  *
36  * By obtaining, using, and/or copying this software and/or its
37  * associated documentation, you agree that you have read, understood,
38  * and will comply with the following terms and conditions:
39  *
40  * Permission to use, copy, modify, and distribute this software and its
41  * associated documentation for any purpose and without fee is hereby
42  * granted, provided that the above copyright notice appears in all
43  * copies, and that both that copyright notice and this permission notice
44  * appear in supporting documentation, and that the name of Secret Labs
45  * AB or the author not be used in advertising or publicity pertaining to
46  * distribution of the software without specific, written prior
47  * permission.
48  *
49  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
50  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
51  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
52  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
53  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
54  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
55  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
56  * -------------------------------------------------------------------- */
57 
58 #include <ctype.h>
59 
60 /* === Internal API ======================================================= */
61 
62 /* --- Internal Unicode Format -------------------------------------------- */
63 
64 /* Python 3.x requires unicode */
65 #define Py_USING_UNICODE
66 
67 #ifndef SIZEOF_WCHAR_T
68 #error Must define SIZEOF_WCHAR_T
69 #endif
70 
71 #define Py_UNICODE_SIZE SIZEOF_WCHAR_T
72 
73 /* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
74    Otherwise, Unicode strings are stored as UCS-2 (with limited support
75    for UTF-16) */
76 
77 #if Py_UNICODE_SIZE >= 4
78 #define Py_UNICODE_WIDE
79 #endif
80 
81 /* Set these flags if the platform has "wchar.h" and the
82    wchar_t type is a 16-bit unsigned type */
83 /* #define HAVE_WCHAR_H */
84 /* #define HAVE_USABLE_WCHAR_T */
85 
86 /* If the compiler provides a wchar_t type we try to support it
87    through the interface functions PyUnicode_FromWideChar(),
88    PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
89 
90 #ifdef HAVE_USABLE_WCHAR_T
91 # ifndef HAVE_WCHAR_H
92 #  define HAVE_WCHAR_H
93 # endif
94 #endif
95 
96 #ifdef HAVE_WCHAR_H
97 #  include <wchar.h>
98 #endif
99 
100 /* Py_UCS4, Py_UCS2 and Py_UCS1 are typedefs for the 32-bit, 16-bit and
101    8-bit unsigned code unit types of the unicode representations. */
102 typedef uint32_t Py_UCS4;
103 typedef uint16_t Py_UCS2;
104 typedef uint8_t Py_UCS1;
105 
106 #ifdef __cplusplus
107 extern "C" {
108 #endif
109 
110 
111 PyAPI_DATA(PyTypeObject) PyUnicode_Type;
112 PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
113 
114 #define PyUnicode_Check(op) \
115     PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)  /* tests the Py_TPFLAGS_UNICODE_SUBCLASS flag on op's type */
116 #define PyUnicode_CheckExact(op) Py_IS_TYPE(op, &PyUnicode_Type)  /* compares op's type against PyUnicode_Type itself */
117 
118 /* --- Constants ---------------------------------------------------------- */
119 
120 /* This Unicode character will be used as replacement character during
121    decoding if the errors argument is set to "replace". Note: the
122    Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
123    Unicode 3.0. */
124 
125 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
126 
127 /* === Public API ========================================================= */
128 
129 /* Create a Unicode object from the UTF-8 encoded buffer u of the given size. */
130 PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
131     const char *u,             /* UTF-8 encoded string */
132     Py_ssize_t size            /* size of buffer */
133     );
134 
135 /* Create a Unicode object from the null-terminated UTF-8 encoded bytes
136    that u points to.  The size is determined with strlen(). */
137 PyAPI_FUNC(PyObject*) PyUnicode_FromString(
138     const char *u              /* UTF-8 encoded string */
139     );
140 
141 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
142 PyAPI_FUNC(PyObject*) PyUnicode_Substring(
143     PyObject *str,
144     Py_ssize_t start,
145     Py_ssize_t end);
146 #endif
147 
148 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
149 /* Copy the string into a UCS4 buffer including the null character if copy_null
150    is set. Return NULL and raise an exception on error. Raise a SystemError if
151    the buffer is smaller than the string. Return buffer on success.
152 
153    buflen is the length of the buffer in (Py_UCS4) characters. */
154 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
155     PyObject *unicode,
156     Py_UCS4* buffer,
157     Py_ssize_t buflen,
158     int copy_null);
159 
160 /* Copy the string into a UCS4 buffer. A new buffer is allocated using
161  * PyMem_Malloc; if this fails, NULL is returned with a memory error
162    exception set. */
163 PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
164 #endif
165 
166 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
167 /* Get the length of the Unicode object. */
168 
169 PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
170     PyObject *unicode
171 );
172 #endif
173 
174 /* Get the number of Py_UNICODE units in the
175    string representation. */
176 
177 Py_DEPRECATED(3.3) PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
178     PyObject *unicode           /* Unicode object */
179     );
180 
181 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
182 /* Read a character from the string. */
183 
184 PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
185     PyObject *unicode,
186     Py_ssize_t index
187     );
188 
189 /* Write a character to the string. The string must have been created through
190    PyUnicode_New, must not be shared, and must not have been hashed yet.
191 
192    Return 0 on success, -1 on error. */
193 
194 PyAPI_FUNC(int) PyUnicode_WriteChar(
195     PyObject *unicode,
196     Py_ssize_t index,
197     Py_UCS4 character
198     );
199 #endif
200 
201 /* Resize a Unicode object. The length is the number of characters, except
202    if the kind of the string is PyUnicode_WCHAR_KIND: in this case, the length
203    is the number of Py_UNICODE characters.
204 
205    *unicode is modified to point to the new (resized) object and 0
206    returned on success.
207 
208    Try to resize the string in place (which is usually faster than allocating
209    a new string and copy characters), or create a new string.
210 
211    Error handling is implemented as follows: an exception is set, -1
212    is returned and *unicode left untouched.
213 
214    WARNING: The function doesn't check string content, the result may not be a
215             string in canonical representation. */
216 
217 PyAPI_FUNC(int) PyUnicode_Resize(
218     PyObject **unicode,         /* Pointer to the Unicode object */
219     Py_ssize_t length           /* New length */
220     );
221 
222 /* Decode obj to a Unicode object.
223 
224    bytes, bytearray and other bytes-like objects are decoded according to the
225    given encoding and error handler. The encoding and error handler can be
226    NULL to have the interface use UTF-8 and "strict".
227 
228    All other objects (including Unicode objects) raise an exception.
229 
230    The API returns NULL in case of an error. The caller is responsible
231    for decref'ing the returned objects.
232 
233 */
234 
235 PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
236     PyObject *obj,              /* Object */
237     const char *encoding,       /* encoding */
238     const char *errors          /* error handling */
239     );
240 
241 /* Copy an instance of a Unicode subtype to a new true Unicode object if
242    necessary. If obj is already a true Unicode object (not a subtype), return
243    the reference with *incremented* refcount.
244 
245    The API returns NULL in case of an error. The caller is responsible
246    for decref'ing the returned objects.
247 
248 */
249 
250 PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
251     PyObject *obj      /* Object */
252     );
253 
254 PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
255     const char *format,   /* ASCII-encoded string  */
256     va_list vargs
257     );
258 PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
259     const char *format,   /* ASCII-encoded string  */
260     ...
261     );
262 
263 PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
264 PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
265     const char *u              /* UTF-8 encoded string */
266     );
267 
268 // PyUnicode_InternImmortal() is deprecated since Python 3.10
269 // and will be removed in Python 3.12. Use PyUnicode_InternInPlace() instead.
270 Py_DEPRECATED(3.10) PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
271 
272 /* --- wchar_t support for platforms which support it --------------------- */
273 
274 #ifdef HAVE_WCHAR_H
275 
276 /* Create a Unicode Object from the wchar_t buffer w of the given
277    size.
278 
279    The buffer is copied into the new object. */
280 
281 PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
282     const wchar_t *w,           /* wchar_t buffer */
283     Py_ssize_t size             /* size of buffer */
284     );
285 
286 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
287    most size wchar_t characters are copied.
288 
289    Note that the resulting wchar_t string may or may not be
290    0-terminated.  It is the responsibility of the caller to make sure
291    that the wchar_t string is 0-terminated in case this is required by
292    the application.
293 
294    Returns the number of wchar_t characters copied (excluding a
295    possibly trailing 0-termination character) or -1 in case of an
296    error. */
297 
298 PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
299     PyObject *unicode,          /* Unicode object */
300     wchar_t *w,                 /* wchar_t buffer */
301     Py_ssize_t size             /* size of buffer */
302     );
303 
304 /* Convert the Unicode object to a wide character string. The output string
305    always ends with a nul character. If size is not NULL, write the number of
306    wide characters (excluding the null character) into *size.
307 
308    Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
309    on success. On error, returns NULL, *size is undefined and raises a
310    MemoryError. */
311 
312 PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
313     PyObject *unicode,          /* Unicode object */
314     Py_ssize_t *size            /* number of characters of the result */
315     );
316 
317 #endif
318 
319 /* --- Unicode ordinals --------------------------------------------------- */
320 
321 /* Create a Unicode Object from the given Unicode code point ordinal.
322 
323    The ordinal must be in range(0x110000). A ValueError is
324    raised in case it is not.
325 
326 */
327 
328 PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
329 
330 /* === Builtin Codecs =====================================================
331 
332    Many of these APIs take two arguments encoding and errors. These
333    parameters encoding and errors have the same semantics as the ones
334    of the builtin str() API.
335 
336    Setting encoding to NULL causes the default encoding (UTF-8) to be used.
337 
338    Error handling is set by errors which may also be set to NULL
339    meaning to use the default handling defined for the codec. Default
340    error handling for all builtin codecs is "strict" (ValueErrors are
341    raised).
342 
343    The codecs all use a similar interface. Only deviation from the
344    generic ones are documented.
345 
346 */
347 
348 /* --- Manage the default encoding ---------------------------------------- */
349 
350 /* Returns "utf-8".  */
351 PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
352 
353 /* --- Generic Codecs ----------------------------------------------------- */
354 
355 /* Create a Unicode object by decoding the encoded string s of the
356    given size. */
357 
358 PyAPI_FUNC(PyObject*) PyUnicode_Decode(
359     const char *s,              /* encoded string */
360     Py_ssize_t size,            /* size of buffer */
361     const char *encoding,       /* encoding */
362     const char *errors          /* error handling */
363     );
364 
365 /* Decode a Unicode object unicode and return the result as Python
366    object.
367 
368    This API is DEPRECATED. The only supported standard encoding is rot13.
369    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
370    that decode from str. */
371 
372 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
373     PyObject *unicode,          /* Unicode object */
374     const char *encoding,       /* encoding */
375     const char *errors          /* error handling */
376     );
377 
378 /* Decode a Unicode object unicode and return the result as Unicode
379    object.
380 
381    This API is DEPRECATED. The only supported standard encoding is rot13.
382    Use PyCodec_Decode() to decode with rot13 and non-standard codecs
383    that decode from str to str. */
384 
385 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
386     PyObject *unicode,          /* Unicode object */
387     const char *encoding,       /* encoding */
388     const char *errors          /* error handling */
389     );
390 
391 /* Encodes a Unicode object and returns the result as Python
392    object.
393 
394    This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
395    since all standard encodings (except rot13) encode str to bytes.
396    Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
397    that encode from str to non-bytes. */
398 
399 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
400     PyObject *unicode,          /* Unicode object */
401     const char *encoding,       /* encoding */
402     const char *errors          /* error handling */
403     );
404 
405 /* Encodes a Unicode object and returns the result as Python bytes
406    object. */
407 
408 PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
409     PyObject *unicode,          /* Unicode object */
410     const char *encoding,       /* encoding */
411     const char *errors          /* error handling */
412     );
413 
414 /* Encodes a Unicode object and returns the result as Unicode
415    object.
416 
417    This API is DEPRECATED.  The only supported standard encoding is rot13.
418    Use PyCodec_Encode() to encode with rot13 and non-standard codecs
419    that encode from str to str. */
420 
421 Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
422     PyObject *unicode,          /* Unicode object */
423     const char *encoding,       /* encoding */
424     const char *errors          /* error handling */
425     );
426 
427 /* Build an encoding map. */
428 
429 PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
430     PyObject* string            /* 256 character map */
431    );
432 
433 /* --- UTF-7 Codecs ------------------------------------------------------- */
434 
435 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
436     const char *string,         /* UTF-7 encoded string */
437     Py_ssize_t length,          /* size of string */
438     const char *errors          /* error handling */
439     );
440 
441 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
442     const char *string,         /* UTF-7 encoded string */
443     Py_ssize_t length,          /* size of string */
444     const char *errors,         /* error handling */
445     Py_ssize_t *consumed        /* bytes consumed */
446     );
447 
448 /* --- UTF-8 Codecs ------------------------------------------------------- */
449 
450 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
451     const char *string,         /* UTF-8 encoded string */
452     Py_ssize_t length,          /* size of string */
453     const char *errors          /* error handling */
454     );
455 
456 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
457     const char *string,         /* UTF-8 encoded string */
458     Py_ssize_t length,          /* size of string */
459     const char *errors,         /* error handling */
460     Py_ssize_t *consumed        /* bytes consumed */
461     );
462 
463 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
464     PyObject *unicode           /* Unicode object */
465     );
466 
467 /* Returns a pointer to the default encoding (UTF-8) of the
468    Unicode object unicode and the size of the encoded representation
469    in bytes stored in *size.
470 
471    In case of an error, no *size is set.
472 
473    This function caches the UTF-8 encoded string in the unicodeobject
474    and subsequent calls will return the same string.  The memory is released
475    when the unicodeobject is deallocated.
476 */
477 
478 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
479 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
480     PyObject *unicode,
481     Py_ssize_t *size);
482 #endif
483 
484 /* --- UTF-32 Codecs ------------------------------------------------------ */
485 
486 /* Decodes length bytes from a UTF-32 encoded buffer string and returns
487    the corresponding Unicode object.
488 
489    errors (if non-NULL) defines the error handling. It defaults
490    to "strict".
491 
492    If byteorder is non-NULL, the decoder starts decoding using the
493    given byte order:
494 
495     *byteorder == -1: little endian
496     *byteorder == 0:  native order
497     *byteorder == 1:  big endian
498 
499    In native mode, the first four bytes of the stream are checked for a
500    BOM mark. If found, the BOM mark is analysed, the byte order
501    adjusted and the BOM skipped.  In the other modes, no BOM mark
502    interpretation is done. After completion, *byteorder is set to the
503    current byte order at the end of input data.
504 
505    If byteorder is NULL, the codec starts in native order mode.
506 
507 */
508 
509 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
510     const char *string,         /* UTF-32 encoded string */
511     Py_ssize_t length,          /* size of string */
512     const char *errors,         /* error handling */
513     int *byteorder              /* pointer to byteorder to use
514                                    0=native;-1=LE,1=BE; updated on
515                                    exit */
516     );
517 
518 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
519     const char *string,         /* UTF-32 encoded string */
520     Py_ssize_t length,          /* size of string */
521     const char *errors,         /* error handling */
522     int *byteorder,             /* pointer to byteorder to use
523                                    0=native;-1=LE,1=BE; updated on
524                                    exit */
525     Py_ssize_t *consumed        /* bytes consumed */
526     );
527 
528 /* Returns a Python bytes object using the UTF-32 encoding in native
529    byte order. The result always starts with a BOM mark.  */
530 
531 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
532     PyObject *unicode           /* Unicode object */
533     );
534 
535 /* Returns a Python string object holding the UTF-32 encoded value of
536    the Unicode data.
537 
538    If byteorder is not 0, output is written according to the following
539    byte order:
540 
541    byteorder == -1: little endian
542    byteorder == 0:  native byte order (writes a BOM mark)
543    byteorder == 1:  big endian
544 
545    If byteorder is 0, the output string will always start with the
546    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
547    prepended.
548 
549 */
550 
551 /* --- UTF-16 Codecs ------------------------------------------------------ */
552 
553 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
554    the corresponding Unicode object.
555 
556    errors (if non-NULL) defines the error handling. It defaults
557    to "strict".
558 
559    If byteorder is non-NULL, the decoder starts decoding using the
560    given byte order:
561 
562     *byteorder == -1: little endian
563     *byteorder == 0:  native order
564     *byteorder == 1:  big endian
565 
566    In native mode, the first two bytes of the stream are checked for a
567    BOM mark. If found, the BOM mark is analysed, the byte order
568    adjusted and the BOM skipped.  In the other modes, no BOM mark
569    interpretation is done. After completion, *byteorder is set to the
570    current byte order at the end of input data.
571 
572    If byteorder is NULL, the codec starts in native order mode.
573 
574 */
575 
576 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
577     const char *string,         /* UTF-16 encoded string */
578     Py_ssize_t length,          /* size of string */
579     const char *errors,         /* error handling */
580     int *byteorder              /* pointer to byteorder to use
581                                    0=native;-1=LE,1=BE; updated on
582                                    exit */
583     );
584 
585 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
586     const char *string,         /* UTF-16 encoded string */
587     Py_ssize_t length,          /* size of string */
588     const char *errors,         /* error handling */
589     int *byteorder,             /* pointer to byteorder to use
590                                    0=native;-1=LE,1=BE; updated on
591                                    exit */
592     Py_ssize_t *consumed        /* bytes consumed */
593     );
594 
595 /* Returns a Python bytes object using the UTF-16 encoding in native
596    byte order. The result always starts with a BOM mark.  */
597 
598 PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
599     PyObject *unicode           /* Unicode object */
600     );
601 
602 /* --- Unicode-Escape Codecs ---------------------------------------------- */
603 
604 PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
605     const char *string,         /* Unicode-Escape encoded string */
606     Py_ssize_t length,          /* size of string */
607     const char *errors          /* error handling */
608     );
609 
610 PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
611     PyObject *unicode           /* Unicode object */
612     );
613 
614 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
615 
616 PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
617     const char *string,         /* Raw-Unicode-Escape encoded string */
618     Py_ssize_t length,          /* size of string */
619     const char *errors          /* error handling */
620     );
621 
622 PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
623     PyObject *unicode           /* Unicode object */
624     );
625 
626 /* --- Latin-1 Codecs -----------------------------------------------------
627 
628    Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
629 
630 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
631     const char *string,         /* Latin-1 encoded string */
632     Py_ssize_t length,          /* size of string */
633     const char *errors          /* error handling */
634     );
635 
636 PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
637     PyObject *unicode           /* Unicode object */
638     );
639 
640 /* --- ASCII Codecs -------------------------------------------------------
641 
642    Only 7-bit ASCII data is accepted. All other codes generate errors.
643 
644 */
645 
646 PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
647     const char *string,         /* ASCII encoded string */
648     Py_ssize_t length,          /* size of string */
649     const char *errors          /* error handling */
650     );
651 
652 PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
653     PyObject *unicode           /* Unicode object */
654     );
655 
656 /* --- Character Map Codecs -----------------------------------------------
657 
658    This codec uses mappings to encode and decode characters.
659 
660    Decoding mappings must map byte ordinals (integers in the range from 0 to
661    255) to Unicode strings, integers (which are then interpreted as Unicode
662    ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
663    as well as mapped to None, 0xFFFE or '\ufffe' are treated as "undefined
664    mapping" and cause an error.
665 
666    Encoding mappings must map Unicode ordinal integers to bytes objects,
667    integers in the range from 0 to 255 or None.  Unmapped character
668    ordinals (ones which cause a LookupError) as well as mapped to
669    None are treated as "undefined mapping" and cause an error.
670 
671 */
672 
673 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
674     const char *string,         /* Encoded string */
675     Py_ssize_t length,          /* size of string */
676     PyObject *mapping,          /* decoding mapping */
677     const char *errors          /* error handling */
678     );
679 
680 PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
681     PyObject *unicode,          /* Unicode object */
682     PyObject *mapping           /* encoding mapping */
683     );
684 
685 /* --- MBCS codecs for Windows -------------------------------------------- */
686 
687 #ifdef MS_WINDOWS
688 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
689     const char *string,         /* MBCS encoded string */
690     Py_ssize_t length,          /* size of string */
691     const char *errors          /* error handling */
692     );
693 
694 PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
695     const char *string,         /* MBCS encoded string */
696     Py_ssize_t length,          /* size of string */
697     const char *errors,         /* error handling */
698     Py_ssize_t *consumed        /* bytes consumed */
699     );
700 
701 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
702 PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
703     int code_page,              /* code page number */
704     const char *string,         /* encoded string */
705     Py_ssize_t length,          /* size of string */
706     const char *errors,         /* error handling */
707     Py_ssize_t *consumed        /* bytes consumed */
708     );
709 #endif
710 
711 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
712     PyObject *unicode           /* Unicode object */
713     );
714 
715 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
716 PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
717     int code_page,              /* code page number */
718     PyObject *unicode,          /* Unicode object */
719     const char *errors          /* error handling */
720     );
721 #endif
722 
723 #endif /* MS_WINDOWS */
724 
725 /* --- Locale encoding --------------------------------------------------- */
726 
727 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
728 /* Decode a string from the current locale encoding. The decoder is strict if
729    *errors* is "strict" or NULL; if *errors* is "surrogateescape" it uses the
730    'surrogateescape' error handler (PEP 383) to escape undecodable bytes. If a
731    byte sequence can be decoded as a surrogate character and the
732    'surrogateescape' error handler is in use, the byte sequence is escaped
733    instead of being decoded. *str* must end with a null character but cannot
734    contain embedded null characters. */
735 
736 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
737     const char *str,
738     Py_ssize_t len,
739     const char *errors);
740 
741 /* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
742    length using strlen(). */
743 
744 PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
745     const char *str,
746     const char *errors);
747 
748 /* Encode a Unicode object to the current locale encoding. The encoder is
749    strict if *errors* is "strict" or NULL; if *errors* is "surrogateescape"
750    the "surrogateescape" error handler is used. Return a bytes object. The
751    string cannot contain embedded null characters. */
752 
753 PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
754     PyObject *unicode,
755     const char *errors
756     );
757 #endif
758 
759 /* --- File system encoding ---------------------------------------------- */
760 
761 /* ParseTuple converter: encode str objects to bytes using
762    PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
763 
764 PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
765 
766 /* ParseTuple converter: decode bytes objects to unicode using
767    PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
768 
769 PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
770 
771 /* Decode a null-terminated string using Py_FileSystemDefaultEncoding
772    and the "surrogateescape" error handler.
773 
774    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
775    encoding.
776 
777    Use PyUnicode_DecodeFSDefaultAndSize() if the string length is known.
778 */
779 
780 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
781     const char *s               /* encoded string */
782     );
783 
784 /* Decode a string using Py_FileSystemDefaultEncoding
785    and the "surrogateescape" error handler.
786 
787    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
788    encoding.
789 */
790 
791 PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
792     const char *s,               /* encoded string */
793     Py_ssize_t size              /* size */
794     );
795 
796 /* Encode a Unicode object to Py_FileSystemDefaultEncoding with the
797    "surrogateescape" error handler, and return bytes.
798 
799    If Py_FileSystemDefaultEncoding is not set, fall back to the locale
800    encoding.
801 */
802 
803 PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
804     PyObject *unicode
805     );
806 
807 /* --- Methods & Slots ----------------------------------------------------
808 
809    These are capable of handling Unicode objects and strings on input
810    (we refer to them as strings in the descriptions) and return
811    Unicode objects or integers as appropriate. */
812 
813 /* Concat two strings giving a new Unicode string. */
814 
815 PyAPI_FUNC(PyObject*) PyUnicode_Concat(
816     PyObject *left,             /* Left string */
817     PyObject *right             /* Right string */
818     );
819 
820 /* Concat two strings and put the result in *pleft
821    (sets *pleft to NULL on error) */
822 
823 PyAPI_FUNC(void) PyUnicode_Append(
824     PyObject **pleft,           /* Pointer to left string */
825     PyObject *right             /* Right string */
826     );
827 
828 /* Concatenate two strings, store the result in *pleft and drop the right
829    object (sets *pleft to NULL on error). */
830 
831 PyAPI_FUNC(void) PyUnicode_AppendAndDel(
832     PyObject **pleft,           /* Pointer to left string; receives the result */
833     PyObject *right             /* Right string; dropped by this call */
834     );
835 
836 /* Split a string, giving a list of Unicode strings.
837 
838    If sep is NULL, splitting will be done at all whitespace
839    substrings. Otherwise, splits occur at the given separator.
840 
841    At most maxsplit splits will be done. If maxsplit is negative, no
842    limit is set.
843    Separators are not included in the resulting list.
844 
845 */
846 
847 PyAPI_FUNC(PyObject*) PyUnicode_Split(
848     PyObject *s,                /* String to split */
849     PyObject *sep,              /* String separator, or NULL for whitespace */
850     Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
851     );
852 
853 /* Ditto, but split at line breaks.
854 
855    CRLF is considered to be one line break. Line breaks are not
856    included in the resulting list unless keepends is true. */
857 
858 PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
859     PyObject *s,                /* String to split */
860     int keepends                /* If true, line end markers are included */
861     );
862 
863 /* Partition the string s using the given separator sep. */
864 
865 PyAPI_FUNC(PyObject*) PyUnicode_Partition(
866     PyObject *s,                /* String to partition */
867     PyObject *sep               /* String separator */
868     );
869 
870 /* Partition the string s using the given separator sep, searching from
871    the end of the string. */
872 
873 PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
874     PyObject *s,                /* String to partition */
875     PyObject *sep               /* String separator */
876     );
877 
878 /* Split a string, giving a list of Unicode strings.
879 
880    If sep is NULL, splitting will be done at all whitespace
881    substrings. Otherwise, splits occur at the given separator.
882 
883    At most maxsplit splits will be done; if maxsplit is negative, no
884    limit is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from
885    the end of the string.
886 
887    Separators are not included in the resulting list.
888 
889 */
890 
891 PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
892     PyObject *s,                /* String to split */
893     PyObject *sep,              /* String separator, or NULL for whitespace */
894     Py_ssize_t maxsplit         /* Maxsplit count; negative = no limit */
895     );
896 
897 /* Translate a string by applying a character mapping table to it and
898    return the resulting Unicode object.
899 
900    The mapping table must map Unicode ordinal integers to Unicode strings,
901    Unicode ordinal integers or None (causing deletion of the character).
902 
903    Mapping tables may be dictionaries or sequences. Unmapped character
904    ordinals (ones whose lookup raises a LookupError) are left untouched,
905    i.e. the characters are copied as-is.
906 
907 */
908 
909 PyAPI_FUNC(PyObject *) PyUnicode_Translate(
910     PyObject *str,              /* String to translate */
911     PyObject *table,            /* Translation (mapping) table */
912     const char *errors          /* error handling */
913     );
914 
915 /* Join a sequence of strings using the given separator and return
916    the resulting Unicode string. */
917 
918 PyAPI_FUNC(PyObject*) PyUnicode_Join(
919     PyObject *separator,        /* Separator string */
920     PyObject *seq               /* Sequence of strings to join */
921     );
922 
923 /* Return 1 if substr matches str[start:end] at the given tail end
924    (direction -1: prefix match, +1: suffix match), 0 otherwise. */
925 
926 PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
927     PyObject *str,              /* String */
928     PyObject *substr,           /* Prefix or Suffix string */
929     Py_ssize_t start,           /* Start index */
930     Py_ssize_t end,             /* Stop index */
931     int direction               /* Tail end: -1 prefix, +1 suffix */
932     );
933 
934 /* Return the first position of substr in str[start:end] using the
935    given search direction, or -1 if it is not found. Return -2 if an
936    error occurred and an exception is set. */
937 
938 PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
939     PyObject *str,              /* String */
940     PyObject *substr,           /* Substring to find */
941     Py_ssize_t start,           /* Start index */
942     Py_ssize_t end,             /* Stop index */
943     int direction               /* Find direction: +1 forward, -1 backward */
944     );
945 
946 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
947 /* Like PyUnicode_Find, but search for a single character only. */
948 PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
949     PyObject *str,              /* String */
950     Py_UCS4 ch,                 /* Character to find */
951     Py_ssize_t start,           /* Start index */
952     Py_ssize_t end,             /* Stop index */
953     int direction               /* Find direction, as for PyUnicode_Find */
954     );
955 #endif
956 
957 /* Return the number of occurrences of substr in str[start:end]. */
958 
959 PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
960     PyObject *str,              /* String */
961     PyObject *substr,           /* Substring to count */
962     Py_ssize_t start,           /* Start index */
963     Py_ssize_t end              /* Stop index */
964     );
965 
966 /* Replace at most maxcount occurrences of substr in str with replstr
967    and return the resulting Unicode object. */
968 
969 PyAPI_FUNC(PyObject *) PyUnicode_Replace(
970     PyObject *str,              /* String */
971     PyObject *substr,           /* Substring to find */
972     PyObject *replstr,          /* Replacement string */
973     Py_ssize_t maxcount         /* Max. number of replacements to apply;
974                                    -1 = all */
975     );
976 
977 /* Compare two strings and return -1, 0, 1 for less than, equal, and
978    greater than, respectively.
979    Raise an exception and return -1 on error. */
980 
981 PyAPI_FUNC(int) PyUnicode_Compare(
982     PyObject *left,             /* Left string */
983     PyObject *right             /* Right string */
984     );
985 
986 /* Compare a Unicode object with a C string and return -1, 0, 1 for less
987    than, equal, and greater than, respectively.  It is best to pass only
988    ASCII-encoded strings, but the function interprets the input string as
989    ISO-8859-1 if it contains non-ASCII characters.
990    This function does not raise exceptions. */
991 
992 PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
993     PyObject *left,             /* Unicode object */
994     const char *right           /* ASCII-encoded string */
995     );
996 
997 /* Rich-compare two strings and return one of the following:
998 
999    - NULL in case an exception was raised
1000    - Py_True or Py_False for successful comparisons
1001    - Py_NotImplemented in case the type combination is unknown
1002 
1003    Possible values for op:
1004 
1005      Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
1006 
1007 */
1008 
1009 PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
1010     PyObject *left,             /* Left string */
1011     PyObject *right,            /* Right string */
1012     int op                      /* Operation: one of the Py_XX values above */
1013     );
1014 
1015 /* Apply args (an argument tuple or dictionary) to the format string
1016    format and return the resulting Unicode string. */
1017 
1018 PyAPI_FUNC(PyObject *) PyUnicode_Format(
1019     PyObject *format,           /* Format string */
1020     PyObject *args              /* Argument tuple or dictionary */
1021     );
1022 
1023 /* Check whether element is contained in container and return 1/0
1024    accordingly.
1025 
1026    element has to coerce to a one-element Unicode string. -1 is
1027    returned in case of an error. */
1028 
1029 PyAPI_FUNC(int) PyUnicode_Contains(
1030     PyObject *container,        /* Container string */
1031     PyObject *element           /* Element string */
1032     );
1033 
1034 /* Check whether the string s is a valid identifier. */
1035 
1036 PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
1037 
1038 /* === Characters Type APIs =============================================== */
1039 
1040 #ifndef Py_LIMITED_API
1041 #  define Py_CPYTHON_UNICODEOBJECT_H
1042 #  include "cpython/unicodeobject.h"
1043 #  undef Py_CPYTHON_UNICODEOBJECT_H
1044 #endif
1045 
1046 #ifdef __cplusplus
1047 }
1048 #endif
1049 #endif /* !Py_UNICODEOBJECT_H */
1050