1 #ifndef Py_CPYTHON_UNICODEOBJECT_H
2 #  error "this header file must not be included directly"
3 #endif
4 
/* Py_UNICODE was the native Unicode storage format (code unit) used by
   Python and represents a single Unicode element in the Unicode type.
   With PEP 393, Py_UNICODE is deprecated and is now simply a typedef
   for wchar_t. PY_UNICODE_TYPE names the same underlying type. */
#define PY_UNICODE_TYPE wchar_t
/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
11 
12 /* --- Internal Unicode Operations ---------------------------------------- */
13 
/* Default USE_UNICODE_WCHAR_CACHE to 1 unless it was already defined
   before this header was included. */
#ifndef USE_UNICODE_WCHAR_CACHE
#  define USE_UNICODE_WCHAR_CACHE 1
#endif /* USE_UNICODE_WCHAR_CACHE */
17 
/* Whitespace test with an ASCII fast path.

   Splitting on whitespace is a hot operation and the whitespace involved is
   almost always ASCII, so code points below 128 are answered from the
   _Py_ascii_whitespace look-up table; everything else is delegated to
   _PyUnicode_IsWhitespace().

   Note: ch may be evaluated more than once. */
#define Py_UNICODE_ISSPACE(ch) \
    ((Py_UCS4)(ch) >= 128U ? _PyUnicode_IsWhitespace(ch) : _Py_ascii_whitespace[(ch)])
26 
/* Character classification: thin aliases for the _PyUnicode_Is*() helpers. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversion: thin aliases for the _PyUnicode_To*case() helpers. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric / printable classification. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric value conversion. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

/* True if ch is alphabetic, a decimal digit, a digit or numeric.
   The tests short-circuit in that order; ch is expanded into each test,
   so avoid passing an expression with side effects. */
#define Py_UNICODE_ISALNUM(ch) \
   (Py_UNICODE_ISALPHA(ch) || \
    Py_UNICODE_ISDECIMAL(ch) || \
    Py_UNICODE_ISDIGIT(ch) || \
    Py_UNICODE_ISNUMERIC(ch))
52 
/* Surrogate helpers. The range tests expand ch twice, so avoid passing an
   expression with side effects. */

/* Any surrogate: U+D800-U+DFFF. */
#define Py_UNICODE_IS_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDFFF)
/* High (leading) surrogate: U+D800-U+DBFF. */
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) ((ch) >= 0xD800 && (ch) <= 0xDBFF)
/* Low (trailing) surrogate: U+DC00-U+DFFF. */
#define Py_UNICODE_IS_LOW_SURROGATE(ch) ((ch) >= 0xDC00 && (ch) <= 0xDFFF)
/* Combine a high and a low surrogate into the single Py_UCS4 code point
   they encode: the low 10 bits of each are concatenated and offset by
   0x10000. */
#define Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (0x10000 + ((((Py_UCS4)(high) & 0x03FF) << 10) |  \
                ((Py_UCS4)(low) & 0x03FF)))
/* High surrogate of ch: the top 10 bits (after removing the 0x10000 offset)
   added to 0xD800. */
#define Py_UNICODE_HIGH_SURROGATE(ch) (((ch) >> 10) + (0xD800 - (0x10000 >> 10)))
/* Low surrogate of ch: the bottom 10 bits added to 0xDC00. */
#define Py_UNICODE_LOW_SURROGATE(ch) (((ch) & 0x3FF) + 0xDC00)
65 
66 /* --- Unicode Type ------------------------------------------------------- */
67 
/* Base layout shared by every Unicode object.

   ASCII-only strings created through PyUnicode_New use the PyASCIIObject
   structure. state.ascii and state.compact are set, and the data
   immediately follows the structure. utf8_length and wstr_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
    /* There are 4 forms of Unicode strings:

       - compact ascii:

         * structure = PyASCIIObject
         * test: PyUnicode_IS_COMPACT_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * ready = 1
         * (length is the length of the utf8 and wstr strings)
         * (data starts just after the structure)
         * (since ASCII is decoded from UTF-8, the utf8 string is the data)

       - compact:

         * structure = PyCompactUnicodeObject
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ready = 1
         * ascii = 0
         * utf8 is not shared with data
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data and wstr_length=length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL
         * (data starts just after the structure)

       - legacy string, not ready:

         * structure = PyUnicodeObject
         * test: kind == PyUnicode_WCHAR_KIND
         * length = 0 (use wstr_length)
         * hash = -1
         * kind = PyUnicode_WCHAR_KIND
         * compact = 0
         * ascii = 0
         * ready = 0
         * interned = SSTATE_NOT_INTERNED
         * wstr is not NULL
         * data.any is NULL
         * utf8 is NULL
         * utf8_length = 0

       - legacy string, ready:

         * structure = PyUnicodeObject
         * test: !PyUnicode_IS_COMPACT(op) && kind != PyUnicode_WCHAR_KIND
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * ready = 1
         * data.any is not NULL
         * utf8 is shared with data.any and utf8_length = length if ascii = 1
         * utf8_length = 0 if utf8 is NULL
         * wstr is shared with data.any and wstr_length = length
           if kind=PyUnicode_2BYTE_KIND and sizeof(wchar_t)=2
           or if kind=PyUnicode_4BYTE_KIND and sizeof(wchar_t)=4
         * wstr_length = 0 if wstr is NULL

       Compact strings use only one memory block (structure + characters),
       whereas legacy strings use one block for the structure and one block
       for characters.

       Legacy strings are created by PyUnicode_FromUnicode() and
       PyUnicode_FromStringAndSize(NULL, size) functions. They become ready
       when PyUnicode_READY() is called.

       See also _PyUnicode_CheckConsistency().
    */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
    struct {
        /*
           SSTATE_NOT_INTERNED (0)
           SSTATE_INTERNED_MORTAL (1)
           SSTATE_INTERNED_IMMORTAL (2)

           If interned != SSTATE_NOT_INTERNED, the two references from the
           dictionary to this object are *not* counted in ob_refcnt.
         */
        unsigned int interned:2;
        /* Character size:

           - PyUnicode_WCHAR_KIND (0):

             * character type = wchar_t (16 or 32 bits, depending on the
               platform)

           - PyUnicode_1BYTE_KIND (1):

             * character type = Py_UCS1 (8 bits, unsigned)
             * all characters are in the range U+0000-U+00FF (latin1)
             * if ascii is set, all characters are in the range U+0000-U+007F
               (ASCII), otherwise at least one character is in the range
               U+0080-U+00FF

           - PyUnicode_2BYTE_KIND (2):

             * character type = Py_UCS2 (16 bits, unsigned)
             * all characters are in the range U+0000-U+FFFF (BMP)
             * at least one character is in the range U+0100-U+FFFF

           - PyUnicode_4BYTE_KIND (4):

             * character type = Py_UCS4 (32 bits, unsigned)
             * all characters are in the range U+0000-U+10FFFF
             * at least one character is in the range U+10000-U+10FFFF
         */
        unsigned int kind:3;
        /* Compact is with respect to the allocation scheme. Compact unicode
           objects only require one memory block while non-compact objects use
           one block for the PyUnicodeObject struct and another for its data
           buffer. */
        unsigned int compact:1;
        /* The string only contains characters in the range U+0000-U+007F (ASCII)
           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
           set, use the PyASCIIObject structure. */
        unsigned int ascii:1;
        /* The ready flag indicates whether the object layout is initialized
           completely. This means that this is either a compact object, or
           the data pointer is filled out. The bit is redundant, and helps
           to minimize the test in PyUnicode_IS_READY(). */
        unsigned int ready:1;
        /* Padding to ensure that PyUnicode_DATA() is always aligned to
           4 bytes (see issue #19537 on m68k). */
        unsigned int :24;
    } state;
    wchar_t *wstr;              /* wchar_t representation (null-terminated) */
} PyASCIIObject;
207 
/* Non-ASCII strings allocated through PyUnicode_New use the
   PyCompactUnicodeObject structure. state.compact is set, and the data
   immediately follows the structure. It extends PyASCIIObject (embedded
   as the first member) with cached UTF-8 and wstr bookkeeping. */
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
                                 * terminating \0. */
    char *utf8;                 /* UTF-8 representation (null-terminated) */
    Py_ssize_t wstr_length;     /* Number of code points in wstr, possible
                                 * surrogates count as two code points. */
} PyCompactUnicodeObject;
219 
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
   PyUnicodeObject structure. The actual string data is initially in the wstr
   block, and copied into the data block using _PyUnicode_Ready.
   It extends PyCompactUnicodeObject (embedded as the first member) with a
   pointer to a separately allocated character buffer. */
typedef struct {
    PyCompactUnicodeObject _base;
    /* The same buffer pointer, viewed through each possible character type. */
    union {
        void *any;
        Py_UCS1 *latin1;
        Py_UCS2 *ucs2;
        Py_UCS4 *ucs4;
    } data;                     /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
232 
/* Consistency check for a Unicode object; referenced from the layout notes
   in PyASCIIObject.
   NOTE(review): the meaning of check_content and of the return value is not
   visible in this header -- confirm against the implementation. */
PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
    PyObject *op,
    int check_content);
236 
237 
/* Cast helpers. Each one asserts that op passes PyUnicode_Check() and then
   converts it with _Py_CAST to a pointer to the named structure. Only the
   type is asserted: nothing checks that the object actually uses that
   layout (e.g. that it is non-compact before using _PyUnicodeObject_CAST).
   op is expanded more than once. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), \
     _Py_CAST(PyUnicodeObject*, (op)))
247 
248 
249 /* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
250 
/* Values for PyASCIIObject.state: */

/* Interning state, stored in the 2-bit state.interned field and returned
   by PyUnicode_CHECK_INTERNED(). */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
257 
258 /* Use only if you know it's a string */
PyUnicode_CHECK_INTERNED(PyObject * op)259 static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
260     return _PyASCIIObject_CAST(op)->state.interned;
261 }
262 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
263 #  define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
264 #endif
265 
266 /* Fast check to determine whether an object is ready. Equivalent to:
267    PyUnicode_IS_COMPACT(op) || _PyUnicodeObject_CAST(op)->data.any */
PyUnicode_IS_READY(PyObject * op)268 static inline unsigned int PyUnicode_IS_READY(PyObject *op) {
269     return _PyASCIIObject_CAST(op)->state.ready;
270 }
271 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
272 #  define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
273 #endif
274 
275 /* Return true if the string contains only ASCII characters, or 0 if not. The
276    string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
277    ready. */
PyUnicode_IS_ASCII(PyObject * op)278 static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
279     assert(PyUnicode_IS_READY(op));
280     return _PyASCIIObject_CAST(op)->state.ascii;
281 }
282 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
283 #  define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
284 #endif
285 
286 /* Return true if the string is compact or 0 if not.
287    No type checks or Ready calls are performed. */
PyUnicode_IS_COMPACT(PyObject * op)288 static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
289     return _PyASCIIObject_CAST(op)->state.compact;
290 }
291 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
292 #  define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
293 #endif
294 
295 /* Return true if the string is a compact ASCII string (use PyASCIIObject
296    structure), or 0 if not.  No type checks or Ready calls are performed. */
PyUnicode_IS_COMPACT_ASCII(PyObject * op)297 static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
298     return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
299 }
300 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
301 #  define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
302 #endif
303 
/* Storage kind of a Unicode object, as kept in state.kind. The numeric
   values of the *BYTE kinds equal the size in bytes of one character. */
enum PyUnicode_Kind {
/* String contains only wstr (wchar_t) characters.  This is only possible
   when the string was created with a legacy API and _PyUnicode_Ready()
   has not been called yet.  */
    PyUnicode_WCHAR_KIND = 0,
/* Return values of the PyUnicode_KIND() function: */
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
314 
/* Return one of the PyUnicode_*_KIND values defined above (the raw
   state.kind field). The string must be ready; this is only verified with
   assert(). op is expanded more than once, so avoid passing an expression
   with side effects. */
#define PyUnicode_KIND(op) \
    (assert(PyUnicode_IS_READY(op)), \
     _PyASCIIObject_CAST(op)->state.kind)
319 
320 /* Return a void pointer to the raw unicode buffer. */
_PyUnicode_COMPACT_DATA(PyObject * op)321 static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
322     if (PyUnicode_IS_ASCII(op)) {
323         return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
324     }
325     return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
326 }
327 
_PyUnicode_NONCOMPACT_DATA(PyObject * op)328 static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
329     void *data;
330     assert(!PyUnicode_IS_COMPACT(op));
331     data = _PyUnicodeObject_CAST(op)->data.any;
332     assert(data != NULL);
333     return data;
334 }
335 
/* Return a void pointer to the canonical character buffer of op, picking
   the inline buffer for compact strings and data.any otherwise. */
static inline void* PyUnicode_DATA(PyObject *op) {
    return PyUnicode_IS_COMPACT(op)
        ? _PyUnicode_COMPACT_DATA(op)
        : _PyUnicode_NONCOMPACT_DATA(op);
}
/* Macro form routes the argument through _PyObject_CAST. */
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
#  define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
#endif
345 
/* Return PyUnicode_DATA(op) cast to Py_UCS1*, Py_UCS2* or Py_UCS4* for
   direct character access.
   No checks are performed: call PyUnicode_KIND() first and use the macro
   that matches the string's kind. */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
354 
355 /* Returns the length of the unicode string. The caller has to make sure that
356    the string has it's canonical representation set before calling
357    this function.  Call PyUnicode_(FAST_)Ready to ensure that. */
PyUnicode_GET_LENGTH(PyObject * op)358 static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
359     assert(PyUnicode_IS_READY(op));
360     return _PyASCIIObject_CAST(op)->length;
361 }
362 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
363 #  define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
364 #endif
365 
366 /* Write into the canonical representation, this function does not do any sanity
367    checks and is intended for usage in loops.  The caller should cache the
368    kind and data pointers obtained from other function calls.
369    index is the index in the string (starts at 0) and value is the new
370    code point value which should be written to that location. */
PyUnicode_WRITE(int kind,void * data,Py_ssize_t index,Py_UCS4 value)371 static inline void PyUnicode_WRITE(int kind, void *data,
372                                    Py_ssize_t index, Py_UCS4 value)
373 {
374     if (kind == PyUnicode_1BYTE_KIND) {
375         assert(value <= 0xffU);
376         _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
377     }
378     else if (kind == PyUnicode_2BYTE_KIND) {
379         assert(value <= 0xffffU);
380         _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
381     }
382     else {
383         assert(kind == PyUnicode_4BYTE_KIND);
384         assert(value <= 0x10ffffU);
385         _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
386     }
387 }
388 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
389 #define PyUnicode_WRITE(kind, data, index, value) \
390     PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
391                     (index), _Py_STATIC_CAST(Py_UCS4, value))
392 #endif
393 
394 /* Read a code point from the string's canonical representation.  No checks
395    or ready calls are performed. */
PyUnicode_READ(int kind,const void * data,Py_ssize_t index)396 static inline Py_UCS4 PyUnicode_READ(int kind,
397                                      const void *data, Py_ssize_t index)
398 {
399     if (kind == PyUnicode_1BYTE_KIND) {
400         return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
401     }
402     if (kind == PyUnicode_2BYTE_KIND) {
403         return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
404     }
405     assert(kind == PyUnicode_4BYTE_KIND);
406     return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
407 }
408 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
409 #define PyUnicode_READ(kind, data, index) \
410     PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
411                    _Py_STATIC_CAST(const void*, data), \
412                    (index))
413 #endif
414 
415 /* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
416    calls PyUnicode_KIND() and might call it twice.  For single reads, use
417    PyUnicode_READ_CHAR, for multiple consecutive reads callers should
418    cache kind and use PyUnicode_READ instead. */
PyUnicode_READ_CHAR(PyObject * unicode,Py_ssize_t index)419 static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
420 {
421     int kind;
422     assert(PyUnicode_IS_READY(unicode));
423     kind = PyUnicode_KIND(unicode);
424     if (kind == PyUnicode_1BYTE_KIND) {
425         return PyUnicode_1BYTE_DATA(unicode)[index];
426     }
427     if (kind == PyUnicode_2BYTE_KIND) {
428         return PyUnicode_2BYTE_DATA(unicode)[index];
429     }
430     assert(kind == PyUnicode_4BYTE_KIND);
431     return PyUnicode_4BYTE_DATA(unicode)[index];
432 }
433 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
434 #  define PyUnicode_READ_CHAR(unicode, index) \
435        PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
436 #endif
437 
438 /* Return a maximum character value which is suitable for creating another
439    string based on op.  This is always an approximation but more efficient
440    than iterating over the string. */
PyUnicode_MAX_CHAR_VALUE(PyObject * op)441 static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
442 {
443     int kind;
444 
445     assert(PyUnicode_IS_READY(op));
446     if (PyUnicode_IS_ASCII(op)) {
447         return 0x7fU;
448     }
449 
450     kind = PyUnicode_KIND(op);
451     if (kind == PyUnicode_1BYTE_KIND) {
452        return 0xffU;
453     }
454     if (kind == PyUnicode_2BYTE_KIND) {
455         return 0xffffU;
456     }
457     assert(kind == PyUnicode_4BYTE_KIND);
458     return 0x10ffffU;
459 }
460 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
461 #  define PyUnicode_MAX_CHAR_VALUE(op) \
462        PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
463 #endif
464 
465 /* === Public API ========================================================= */
466 
467 /* --- Plain Py_UNICODE --------------------------------------------------- */
468 
/* With PEP 393, this is the recommended way to allocate a new unicode object.
   This function allocates the object and its character buffer in a single
   memory block.  Objects created using this function are not resizable.

   size:    number of code points in the new string.
   maxchar: maximum code point value that will be stored in the string. */
PyAPI_FUNC(PyObject*) PyUnicode_New(
    Py_ssize_t size,            /* Number of code points in the new string */
    Py_UCS4 maxchar             /* maximum code point value in the string */
    );
476 
/* Initialize the canonical string representation from the deprecated
   wstr/Py_UNICODE representation. This function is used to convert Unicode
   objects which were created using the old API to the new flexible format
   introduced with PEP 393.

   Don't call this function directly, use the public PyUnicode_READY() function
   instead, which documents the result as 0 on success and -1 on error. */
PyAPI_FUNC(int) _PyUnicode_Ready(
    PyObject *unicode           /* Unicode object */
    );
487 
488 /* PyUnicode_READY() does less work than _PyUnicode_Ready() in the best
489    case.  If the canonical representation is not yet set, it will still call
490    _PyUnicode_Ready().
491    Returns 0 on success and -1 on errors. */
PyUnicode_READY(PyObject * op)492 static inline int PyUnicode_READY(PyObject *op)
493 {
494     if (PyUnicode_IS_READY(op)) {
495         return 0;
496     }
497     return _PyUnicode_Ready(op);
498 }
499 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
500 #  define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
501 #endif
502 
/* Get a copy of a Unicode string.
   NOTE(review): reference ownership of the result and the error return are
   not visible in this header -- confirm against the implementation. */
PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
    PyObject *unicode
    );
507 
/* Copy characters from one unicode object into another. This function
   performs character conversion when necessary and falls back to memcpy()
   if possible.

   Fail if to is too small (smaller than *how_many* or smaller than
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
   kind(to), or if *to* has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error.

   Pseudo-code:

       how_many = min(how_many, len(from) - from_start)
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
       return how_many

   Note: The function doesn't write a terminating null character.
   */
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
533 
/* Unsafe version of PyUnicode_CopyCharacters(): the arguments are not
   checked, so it may crash if parameters are invalid (e.g. if the output
   string is too short). Returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
    PyObject *to,
    Py_ssize_t to_start,
    PyObject *from,
    Py_ssize_t from_start,
    Py_ssize_t how_many
    );
544 
/* Fill a string with a character: write fill_char into
   unicode[start:start+length].

   Fail if fill_char is bigger than the string maximum character, or if the
   string has more than 1 reference.

   Return the number of written characters, or return -1 and raise an
   exception on error. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
559 
/* Unsafe version of PyUnicode_Fill(): the arguments are not checked, so it
   may crash if parameters are invalid (e.g. if length is longer than the
   string). Returns nothing. */
PyAPI_FUNC(void) _PyUnicode_FastFill(
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t length,
    Py_UCS4 fill_char
    );
568 
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4
   characters; kind selects which of the three the buffer holds and size is
   its length. The buffer is scanned to find the maximum character. */
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
    int kind,
    const void *buffer,
    Py_ssize_t size);
575 
/* Create a new string from a buffer of ASCII characters.
   WARNING: the buffer is NOT checked for non-ASCII characters; the caller
   must guarantee that it contains none. */
PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
    const char *buffer,
    Py_ssize_t size);
581 
/* Compute the maximum character of the substring unicode[start:end].
   Return 127 for an empty string. */
PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
    PyObject *unicode,
    Py_ssize_t start,
    Py_ssize_t end);
588 
589 /* --- Legacy deprecated API ---------------------------------------------- */
590 
/* Create a Unicode Object from the Py_UNICODE buffer u of the given
   size. Deprecated since 3.3.

   u may be NULL, in which case the contents are undefined. It is the
   user's responsibility to fill in the needed data afterwards. Note
   that modifying the Unicode object contents after construction is
   only allowed if u was set to NULL.

   The buffer is copied into the new object. */
Py_DEPRECATED(3.3) PyAPI_FUNC(PyObject*) PyUnicode_FromUnicode(
    const Py_UNICODE *u,        /* Unicode buffer */
    Py_ssize_t size             /* size of buffer */
    );
604 
/* Return a read-only pointer to the Unicode object's internal
   Py_UNICODE buffer. Deprecated since 3.3.
   If the wchar_t/Py_UNICODE representation is not yet available, this
   function will calculate it. */
Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
    PyObject *unicode           /* Unicode object */
    );
612 
/* Similar to PyUnicode_AsUnicode(), but raises a ValueError if the string
   contains null characters. The returned buffer is const. */
PyAPI_FUNC(const Py_UNICODE *) _PyUnicode_AsUnicode(
    PyObject *unicode           /* Unicode object */
    );
618 
/* Return a read-only pointer to the Unicode object's internal
   Py_UNICODE buffer and store its length in *size. Deprecated since 3.3.
   If the wchar_t/Py_UNICODE representation is not yet available, this
   function will calculate it. */

Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
    PyObject *unicode,          /* Unicode object */
    Py_ssize_t *size            /* location where to save the length */
    );
628 
629 
630 /* Fast access macros */
631 
632 Py_DEPRECATED(3.3)
PyUnicode_WSTR_LENGTH(PyObject * op)633 static inline Py_ssize_t PyUnicode_WSTR_LENGTH(PyObject *op)
634 {
635     if (PyUnicode_IS_COMPACT_ASCII(op)) {
636         return _PyASCIIObject_CAST(op)->length;
637     }
638     else {
639         return _PyCompactUnicodeObject_CAST(op)->wstr_length;
640     }
641 }
642 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
643 #  define PyUnicode_WSTR_LENGTH(op) PyUnicode_WSTR_LENGTH(_PyObject_CAST(op))
644 #endif
645 
646 /* Returns the deprecated Py_UNICODE representation's size in code units
647    (this includes surrogate pairs as 2 units).
648    If the Py_UNICODE representation is not available, it will be computed
649    on request.  Use PyUnicode_GET_LENGTH() for the length in code points. */
650 
651 Py_DEPRECATED(3.3)
PyUnicode_GET_SIZE(PyObject * op)652 static inline Py_ssize_t PyUnicode_GET_SIZE(PyObject *op)
653 {
654     _Py_COMP_DIAG_PUSH
655     _Py_COMP_DIAG_IGNORE_DEPR_DECLS
656     if (_PyASCIIObject_CAST(op)->wstr == _Py_NULL) {
657         (void)PyUnicode_AsUnicode(op);
658         assert(_PyASCIIObject_CAST(op)->wstr != _Py_NULL);
659     }
660     return PyUnicode_WSTR_LENGTH(op);
661     _Py_COMP_DIAG_POP
662 }
663 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
664 #  define PyUnicode_GET_SIZE(op) PyUnicode_GET_SIZE(_PyObject_CAST(op))
665 #endif
666 
667 Py_DEPRECATED(3.3)
PyUnicode_GET_DATA_SIZE(PyObject * op)668 static inline Py_ssize_t PyUnicode_GET_DATA_SIZE(PyObject *op)
669 {
670     _Py_COMP_DIAG_PUSH
671     _Py_COMP_DIAG_IGNORE_DEPR_DECLS
672     return PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE;
673     _Py_COMP_DIAG_POP
674 }
675 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
676 #  define PyUnicode_GET_DATA_SIZE(op) PyUnicode_GET_DATA_SIZE(_PyObject_CAST(op))
677 #endif
678 
679 /* Alias for PyUnicode_AsUnicode().  This will create a wchar_t/Py_UNICODE
680    representation on demand.  Using this macro is very inefficient now,
681    try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
682    use PyUnicode_WRITE() and PyUnicode_READ(). */
683 
684 Py_DEPRECATED(3.3)
PyUnicode_AS_UNICODE(PyObject * op)685 static inline Py_UNICODE* PyUnicode_AS_UNICODE(PyObject *op)
686 {
687     wchar_t *wstr = _PyASCIIObject_CAST(op)->wstr;
688     if (wstr != _Py_NULL) {
689         return wstr;
690     }
691 
692     _Py_COMP_DIAG_PUSH
693     _Py_COMP_DIAG_IGNORE_DEPR_DECLS
694     return PyUnicode_AsUnicode(op);
695     _Py_COMP_DIAG_POP
696 }
697 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
698 #  define PyUnicode_AS_UNICODE(op) PyUnicode_AS_UNICODE(_PyObject_CAST(op))
699 #endif
700 
701 Py_DEPRECATED(3.3)
PyUnicode_AS_DATA(PyObject * op)702 static inline const char* PyUnicode_AS_DATA(PyObject *op)
703 {
704     _Py_COMP_DIAG_PUSH
705     _Py_COMP_DIAG_IGNORE_DEPR_DECLS
706     Py_UNICODE *data = PyUnicode_AS_UNICODE(op);
707     // In C++, casting directly PyUnicode* to const char* is not valid
708     return _Py_STATIC_CAST(const char*, _Py_STATIC_CAST(const void*, data));
709     _Py_COMP_DIAG_POP
710 }
711 #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
712 #  define PyUnicode_AS_DATA(op) PyUnicode_AS_DATA(_PyObject_CAST(op))
713 #endif
714 
715 
716 /* --- _PyUnicodeWriter API ----------------------------------------------- */
717 
/* State of a Unicode writer, used through the _PyUnicodeWriter_* functions
   and macros declared below.
   NOTE(review): buffer/data/kind/maxchar/size/pos carry no comments in this
   header. From _PyUnicodeWriter_Prepare(), size - pos is the room still
   available and maxchar is the largest character that can currently be
   stored; confirm the remaining fields against the implementation. */
typedef struct {
    PyObject *buffer;
    void *data;
    enum PyUnicode_Kind kind;
    Py_UCS4 maxchar;
    Py_ssize_t size;
    Py_ssize_t pos;

    /* minimum number of allocated characters (default: 0) */
    Py_ssize_t min_length;

    /* minimum character (default: 127, ASCII) */
    Py_UCS4 min_char;

    /* If non-zero, overallocate the buffer (default: 0). */
    unsigned char overallocate;

    /* If readonly is 1, buffer is a shared string (cannot be modified)
       and size is set to 0. */
    unsigned char readonly;
} _PyUnicodeWriter ;
739 
/* Initialize a Unicode writer.
 *
 * By default, the minimum buffer size is 0 characters and overallocation is
 * disabled. Set the min_length, min_char and overallocate attributes to
 * control the allocation of the buffer. */
PyAPI_FUNC(void)
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
747 
/* Prepare the buffer to write 'length' characters
   with the specified maximum character.

   Fast path: evaluates to 0 without any call when MAXCHAR already fits in
   the writer's maxchar and LENGTH fits in the remaining room (size - pos),
   or when LENGTH is 0. Otherwise the work is delegated to
   _PyUnicodeWriter_PrepareInternal().

   WRITER, LENGTH and MAXCHAR may each be evaluated more than once, so avoid
   passing expressions with side effects.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
     ? 0                                                              \
     : (((LENGTH) == 0)                                               \
        ? 0                                                           \
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
759 
760 /* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
761    instead. */
762 PyAPI_FUNC(int)
763 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
764                                  Py_ssize_t length, Py_UCS4 maxchar);
765 
/* Make sure the writer's buffer has at least the kind KIND.
   For instance, KIND=PyUnicode_2BYTE_KIND guarantees that the writer
   supports characters in the range U+0000-U+FFFF.

   KIND must not be PyUnicode_WCHAR_KIND (checked with assert()).  When the
   writer's kind is already large enough nothing is done; otherwise
   _PyUnicodeWriter_PrepareKindInternal() is called.

   Return 0 on success, raise an exception and return -1 on error. */
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
    (assert((KIND) != PyUnicode_WCHAR_KIND),                          \
     ((WRITER)->kind >= (KIND))                                       \
     ? 0                                                              \
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
776 
777 /* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
778    macro instead. */
779 PyAPI_FUNC(int)
780 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
781                                      enum PyUnicode_Kind kind);
782 
783 /* Append a Unicode character.
784    Return 0 on success, raise an exception and return -1 on error. */
785 PyAPI_FUNC(int)
786 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
787     Py_UCS4 ch
788     );
789 
790 /* Append a Unicode string.
791    Return 0 on success, raise an exception and return -1 on error. */
792 PyAPI_FUNC(int)
793 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
794     PyObject *str               /* Unicode string */
795     );
796 
797 /* Append a substring of a Unicode string.
798    Return 0 on success, raise an exception and return -1 on error. */
799 PyAPI_FUNC(int)
800 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
801     PyObject *str,              /* Unicode string */
802     Py_ssize_t start,
803     Py_ssize_t end
804     );
805 
806 /* Append an ASCII-encoded byte string.
807    Return 0 on success, raise an exception and return -1 on error. */
808 PyAPI_FUNC(int)
809 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
810     const char *str,           /* ASCII-encoded byte string */
811     Py_ssize_t len             /* number of bytes, or -1 if unknown */
812     );
813 
814 /* Append a latin1-encoded byte string.
815    Return 0 on success, raise an exception and return -1 on error. */
816 PyAPI_FUNC(int)
817 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
818     const char *str,           /* latin1-encoded byte string */
819     Py_ssize_t len             /* length in bytes */
820     );
821 
822 /* Get the value of the writer as a Unicode string. Clear the
823    buffer of the writer. Raise an exception and return NULL
824    on error. */
825 PyAPI_FUNC(PyObject *)
826 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
827 
828 /* Deallocate memory of a writer (clear its internal buffer). */
829 PyAPI_FUNC(void)
830 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
831 
832 
833 /* Format the object based on the format_spec, as defined in PEP 3101
834    (Advanced String Formatting). */
835 PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
836     _PyUnicodeWriter *writer,
837     PyObject *obj,
838     PyObject *format_spec,
839     Py_ssize_t start,
840     Py_ssize_t end);
841 
842 /* --- Manage the default encoding ---------------------------------------- */
843 
844 /* Returns a pointer to the default encoding (UTF-8) of the
845    Unicode object unicode.
846 
847    Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
848    in the unicodeobject.
849 
850    _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
851    support the previous internal function with the same behaviour.
852 
853    Use of this API is DEPRECATED since no size information can be
854    extracted from the returned data.
855 */
856 
857 PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
858 
859 #define _PyUnicode_AsString PyUnicode_AsUTF8
860 
861 /* --- UTF-7 Codecs ------------------------------------------------------- */
862 
863 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
864     PyObject *unicode,          /* Unicode object */
865     int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
866     int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
867     const char *errors          /* error handling */
868     );
869 
870 /* --- UTF-8 Codecs ------------------------------------------------------- */
871 
872 PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
873     PyObject *unicode,
874     const char *errors);
875 
876 /* --- UTF-32 Codecs ------------------------------------------------------ */
877 
878 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
879     PyObject *object,           /* Unicode object */
880     const char *errors,         /* error handling */
881     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
882     );
883 
884 /* --- UTF-16 Codecs ------------------------------------------------------ */
885 
886 /* Returns a Python string object holding the UTF-16 encoded value of
887    the Unicode data.
888 
889    If byteorder is not 0, output is written according to the following
890    byte order:
891 
892    byteorder == -1: little endian
893    byteorder == 0:  native byte order (writes a BOM mark)
894    byteorder == 1:  big endian
895 
896    If byteorder is 0, the output string will always start with the
897    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
898    prepended.
899 */
900 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
901     PyObject* unicode,          /* Unicode object */
902     const char *errors,         /* error handling */
903     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
904     );
905 
906 /* --- Unicode-Escape Codecs ---------------------------------------------- */
907 
908 /* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
909 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
910         const char *string,     /* Unicode-Escape encoded string */
911         Py_ssize_t length,      /* size of string */
912         const char *errors,     /* error handling */
913         Py_ssize_t *consumed    /* bytes consumed */
914 );
915 /* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
916    chars. */
917 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
918         const char *string,     /* Unicode-Escape encoded string */
919         Py_ssize_t length,      /* size of string */
920         const char *errors,     /* error handling */
921         Py_ssize_t *consumed,   /* bytes consumed */
922         const char **first_invalid_escape  /* on return, points to first
923                                               invalid escaped char in
924                                               string. */
925 );
926 
927 /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
928 
929 /* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
930 PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
931         const char *string,     /* Unicode-Escape encoded string */
932         Py_ssize_t length,      /* size of string */
933         const char *errors,     /* error handling */
934         Py_ssize_t *consumed    /* bytes consumed */
935 );
936 
937 /* --- Latin-1 Codecs ----------------------------------------------------- */
938 
939 PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
940     PyObject* unicode,
941     const char* errors);
942 
943 /* --- ASCII Codecs ------------------------------------------------------- */
944 
945 PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
946     PyObject* unicode,
947     const char* errors);
948 
949 /* --- Character Map Codecs ----------------------------------------------- */
950 
951 /* Translate an Unicode object by applying a character mapping table to
952    it and return the resulting Unicode object.
953 
954    The mapping table must map Unicode ordinal integers to Unicode strings,
955    Unicode ordinal integers or None (causing deletion of the character).
956 
957    Mapping tables may be dictionaries or sequences. Unmapped character
958    ordinals (ones which cause a LookupError) are left untouched and
959    are copied as-is.
960 */
961 PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
962     PyObject *unicode,          /* Unicode object */
963     PyObject *mapping,          /* encoding mapping */
964     const char *errors          /* error handling */
965     );
966 
967 /* --- Decimal Encoder ---------------------------------------------------- */
968 
969 /* Coverts a Unicode object holding a decimal value to an ASCII string
970    for using in int, float and complex parsers.
971    Transforms code points that have decimal digit property to the
972    corresponding ASCII digit code points.  Transforms spaces to ASCII.
973    Transforms code points starting from the first non-ASCII code point that
974    is neither a decimal digit nor a space to the end into '?'. */
975 
976 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
977     PyObject *unicode           /* Unicode object */
978     );
979 
980 /* --- Methods & Slots ---------------------------------------------------- */
981 
982 PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
983     PyObject *separator,
984     PyObject *const *items,
985     Py_ssize_t seqlen
986     );
987 
988 /* Test whether a unicode is equal to ASCII identifier.  Return 1 if true,
989    0 otherwise.  The right argument must be ASCII identifier.
990    Any error occurs inside will be cleared before return. */
991 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
992     PyObject *left,             /* Left string */
993     _Py_Identifier *right       /* Right identifier */
994     );
995 
996 /* Test whether a unicode is equal to ASCII string.  Return 1 if true,
997    0 otherwise.  The right argument must be ASCII-encoded string.
998    Any error occurs inside will be cleared before return. */
999 PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
1000     PyObject *left,
1001     const char *right           /* ASCII-encoded string */
1002     );
1003 
1004 /* Externally visible for str.strip(unicode) */
1005 PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
1006     PyObject *self,
1007     int striptype,
1008     PyObject *sepobj
1009     );
1010 
1011 /* Using explicit passed-in values, insert the thousands grouping
1012    into the string pointed to by buffer.  For the argument descriptions,
1013    see Objects/stringlib/localeutil.h */
1014 PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
1015     _PyUnicodeWriter *writer,
1016     Py_ssize_t n_buffer,
1017     PyObject *digits,
1018     Py_ssize_t d_pos,
1019     Py_ssize_t n_digits,
1020     Py_ssize_t min_width,
1021     const char *grouping,
1022     PyObject *thousands_sep,
1023     Py_UCS4 *maxchar);
1024 
1025 /* === Characters Type APIs =============================================== */
1026 
/* Helper array used by Py_UNICODE_ISSPACE(): the macro indexes it with the
   character itself whenever the character is below 128, and falls back to
   _PyUnicode_IsWhitespace() otherwise. */

PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
1030 
1031 /* These should not be used directly. Use the Py_UNICODE_IS* and
1032    Py_UNICODE_TO* macros instead.
1033 
1034    These APIs are implemented in Objects/unicodectype.c.
1035 
1036 */
1037 
1038 PyAPI_FUNC(int) _PyUnicode_IsLowercase(
1039     Py_UCS4 ch       /* Unicode character */
1040     );
1041 
1042 PyAPI_FUNC(int) _PyUnicode_IsUppercase(
1043     Py_UCS4 ch       /* Unicode character */
1044     );
1045 
1046 PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
1047     Py_UCS4 ch       /* Unicode character */
1048     );
1049 
1050 PyAPI_FUNC(int) _PyUnicode_IsXidStart(
1051     Py_UCS4 ch       /* Unicode character */
1052     );
1053 
1054 PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
1055     Py_UCS4 ch       /* Unicode character */
1056     );
1057 
1058 PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
1059     const Py_UCS4 ch         /* Unicode character */
1060     );
1061 
1062 PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
1063     const Py_UCS4 ch         /* Unicode character */
1064     );
1065 
/* Case conversions that take one Py_UCS4 and return one Py_UCS4.  They are
   what the Py_UNICODE_TOLOWER, Py_UNICODE_TOUPPER and Py_UNICODE_TOTITLE
   macros expand to.

   NOTE(review): only _PyUnicode_ToTitlecase carries an active
   Py_DEPRECATED(3.3) marker; on the other two the marker is commented out.
   Confirm whether the difference is intentional. */
/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
    Py_UCS4 ch       /* Unicode character */
    );

/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
    Py_UCS4 ch       /* Unicode character */
    );

Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
    Py_UCS4 ch       /* Unicode character */
    );
1077 
1078 PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
1079     Py_UCS4 ch,       /* Unicode character */
1080     Py_UCS4 *res
1081     );
1082 
1083 PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
1084     Py_UCS4 ch,       /* Unicode character */
1085     Py_UCS4 *res
1086     );
1087 
1088 PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
1089     Py_UCS4 ch,       /* Unicode character */
1090     Py_UCS4 *res
1091     );
1092 
1093 PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
1094     Py_UCS4 ch,       /* Unicode character */
1095     Py_UCS4 *res
1096     );
1097 
1098 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
1099     Py_UCS4 ch         /* Unicode character */
1100     );
1101 
1102 PyAPI_FUNC(int) _PyUnicode_IsCased(
1103     Py_UCS4 ch         /* Unicode character */
1104     );
1105 
1106 PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
1107     Py_UCS4 ch       /* Unicode character */
1108     );
1109 
1110 PyAPI_FUNC(int) _PyUnicode_ToDigit(
1111     Py_UCS4 ch       /* Unicode character */
1112     );
1113 
1114 PyAPI_FUNC(double) _PyUnicode_ToNumeric(
1115     Py_UCS4 ch       /* Unicode character */
1116     );
1117 
1118 PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
1119     Py_UCS4 ch       /* Unicode character */
1120     );
1121 
1122 PyAPI_FUNC(int) _PyUnicode_IsDigit(
1123     Py_UCS4 ch       /* Unicode character */
1124     );
1125 
1126 PyAPI_FUNC(int) _PyUnicode_IsNumeric(
1127     Py_UCS4 ch       /* Unicode character */
1128     );
1129 
1130 PyAPI_FUNC(int) _PyUnicode_IsPrintable(
1131     Py_UCS4 ch       /* Unicode character */
1132     );
1133 
1134 PyAPI_FUNC(int) _PyUnicode_IsAlpha(
1135     Py_UCS4 ch       /* Unicode character */
1136     );
1137 
1138 PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
1139 
1140 /* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
1141 PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
1142 
1143 /* Fast equality check when the inputs are known to be exact unicode types
1144    and where the hash values are equal (i.e. a very probable match) */
1145 PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
1146 
1147 /* Equality check. Returns -1 on failure. */
1148 PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
1149 
1150 PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
1151 PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
1152 
1153 PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
1154