1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <[email protected]>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h"      // _PyIndex_Check()
44 #include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
45 #include "pycore_bytesobject.h"   // _PyBytes_Repeat()
46 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
47 #include "pycore_format.h"        // F_LJUST
48 #include "pycore_initconfig.h"    // _PyStatus_OK()
49 #include "pycore_interp.h"        // PyInterpreterState.fs_codec
50 #include "pycore_long.h"          // _PyLong_FormatWriter()
51 #include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52 #include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
53 #include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
54 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
55 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
56 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
57 #include "stringlib/eq.h"         // unicode_eq()
58 
59 #ifdef MS_WINDOWS
60 #include <windows.h>
61 #endif
62 
63 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64 #  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
65 #endif
66 
67 /* Uncomment to display statistics on interned strings at exit
68    in _PyUnicode_ClearInterned(). */
69 /* #define INTERNED_STATS 1 */
70 
71 
72 /*[clinic input]
73 class str "PyObject *" "&PyUnicode_Type"
74 [clinic start generated code]*/
75 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76 
77 /*[python input]
78 class Py_UCS4_converter(CConverter):
79     type = 'Py_UCS4'
80     converter = 'convert_uc'
81 
82     def converter_init(self):
83         if self.default is not unspecified:
84             self.c_default = ascii(self.default)
85             if len(self.c_default) > 4 or self.c_default[0] != "'":
86                 self.c_default = hex(ord(self.default))
87 
88 [python start generated code]*/
89 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90 
91 /* --- Globals ------------------------------------------------------------
92 
93 NOTE: In the interpreter's initialization phase, some globals are currently
94       initialized dynamically as needed. In the process Unicode objects may
95       be created before the Unicode type is ready.
96 
97 */
98 
99 
100 #ifdef __cplusplus
101 extern "C" {
102 #endif
103 
104 // Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
105 // The value must be the same in fileutils.c.
106 #define MAX_UNICODE 0x10ffff
107 
/* Sanity check used inside the assertions of the accessor macros of this
   file: the _PyUnicode_CheckConsistency() validation (without the content
   scan) in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
113 
/* utf8 field of the compact object header (no checks performed). */
#define _PyUnicode_UTF8(op)                             \
    (_PyCompactUnicodeObject_CAST(op)->utf8)
/* Checked UTF-8 pointer: a compact ASCII string uses the bytes stored right
   after its PyASCIIObject header, any other string its utf8 field.
   op is evaluated more than once. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)(_PyASCIIObject_CAST(op) + 1)) :       \
         _PyUnicode_UTF8(op))
/* utf8_length field of the compact object header (no checks performed). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->utf8_length)
/* Checked UTF-8 length: a compact ASCII string reports its length field,
   any other string its utf8_length field.  op is evaluated more than once. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         _PyASCIIObject_CAST(op)->length :              \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr field of the ASCII object header (no checks performed). */
#define _PyUnicode_WSTR(op)                             \
    (_PyASCIIObject_CAST(op)->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* File-local replacement: a compact ASCII string reports its length field,
   any other string its wstr_length field. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     _PyASCIIObject_CAST(op)->length :                  \
     _PyCompactUnicodeObject_CAST(op)->wstr_length)
/* wstr_length field of the compact object header (no checks performed). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->wstr_length)
/* Direct access to the length, state and hash fields of the ASCII header.
   These expand to the fields themselves and perform no checks. */
#define _PyUnicode_LENGTH(op)                           \
    (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)                            \
    (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)                             \
    (_PyASCIIObject_CAST(op)->hash)
/* Same as reading state.kind / length directly, but asserting first that op
   passes _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->length)
/* data.any pointer of the full PyUnicodeObject layout (no checks performed). */
#define _PyUnicode_DATA_ANY(op)                         \
    (_PyUnicodeObject_CAST(op)->data.any)
155 
/* Override any earlier definition of PyUnicode_READY() with a file-local one:
   after asserting _PyUnicode_CHECK(op), it evaluates to 0 when op is already
   ready and to the result of _PyUnicode_Ready(op) otherwise.
   op is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
162 
/* True if the utf8 pointer of op is the same address as its character data,
   i.e. the UTF-8 representation is not a separate buffer.  Must not be used
   on compact ASCII strings (asserted).  op is evaluated more than once. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as its character data,
   i.e. the wchar_t representation is not a separate buffer.
   op is evaluated more than once.

   The comparison uses the macro parameter `op` on both sides; it previously
   referenced an identifier named `unicode`, which only compiled when the
   caller happened to have such a variable in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
170 
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data.
   op is evaluated more than once. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   string is not ready yet or wstr differs from the character data.
   op is evaluated more than once. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
184 
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each element is converted with a plain cast to to_type; no range check is
   performed.  The main loop handles four elements per iteration and a
   second loop copies the remaining (fewer than four) elements.
   begin, end and to are each evaluated exactly once.  The macro declares the
   locals _to, _iter, _end, n and _unrolled_end inside its own block. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
208 
/* Evaluate to the singleton one-character string object (as PyObject *) for
   the code point ch: values below 128 index the `ascii` singleton table,
   larger values index the `latin1` table at ch - 128.
   The caller must pass a value in the range 0..255.

   Every use of ch is parenthesized so that an argument expression with
   lower-precedence operators (e.g. `a | b`) is compared and indexed as a
   whole.  ch is evaluated more than once, so it must be free of side
   effects. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
213 
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
221 
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
231 
232 /* Forward declaration */
233 static inline int
234 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
235 static inline void
236 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
237 static PyObject *
238 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
239                     const char *errors);
240 static PyObject *
241 unicode_decode_utf8(const char *s, Py_ssize_t size,
242                     _Py_error_handler error_handler, const char *errors,
243                     Py_ssize_t *consumed);
244 #ifdef Py_DEBUG
245 static inline int unicode_is_finalizing(void);
246 static int unicode_is_singleton(PyObject *unicode);
247 #endif
248 
249 
250 // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)251 static inline PyObject* unicode_get_empty(void)
252 {
253     _Py_DECLARE_STR(empty, "");
254     return &_Py_STR(empty);
255 }
256 
257 
258 // Return a strong reference to the empty string singleton.
unicode_new_empty(void)259 static inline PyObject* unicode_new_empty(void)
260 {
261     PyObject *empty = unicode_get_empty();
262     Py_INCREF(empty);
263     return empty;
264 }
265 
/* Statement macro: return a new strong reference to the empty string
   singleton from the enclosing function (which must return PyObject *). */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
270 
271 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)272 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
273              Py_ssize_t start, Py_ssize_t length)
274 {
275     assert(0 <= start);
276     assert(kind != PyUnicode_WCHAR_KIND);
277     switch (kind) {
278     case PyUnicode_1BYTE_KIND: {
279         assert(value <= 0xff);
280         Py_UCS1 ch = (unsigned char)value;
281         Py_UCS1 *to = (Py_UCS1 *)data + start;
282         memset(to, ch, length);
283         break;
284     }
285     case PyUnicode_2BYTE_KIND: {
286         assert(value <= 0xffff);
287         Py_UCS2 ch = (Py_UCS2)value;
288         Py_UCS2 *to = (Py_UCS2 *)data + start;
289         const Py_UCS2 *end = to + length;
290         for (; to < end; ++to) *to = ch;
291         break;
292     }
293     case PyUnicode_4BYTE_KIND: {
294         assert(value <= MAX_UNICODE);
295         Py_UCS4 ch = value;
296         Py_UCS4 * to = (Py_UCS4 *)data + start;
297         const Py_UCS4 *end = to + length;
298         for (; to < end; ++to) *to = ch;
299         break;
300     }
301     default: Py_UNREACHABLE();
302     }
303 }
304 
305 
/* Fast detection of the most frequent whitespace characters.

   The table has one entry per 7-bit code point (0..127); an entry is 1 when
   the code point is whitespace and 0 otherwise.  Entries that are not listed
   below are zero-initialized. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
336 
337 /* forward */
338 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
339 static PyObject* get_latin1_char(unsigned char ch);
340 static int unicode_modifiable(PyObject *unicode);
341 
342 
343 static PyObject *
344 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
345 static PyObject *
346 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347 static PyObject *
348 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349 
350 static PyObject *
351 unicode_encode_call_errorhandler(const char *errors,
352        PyObject **errorHandler,const char *encoding, const char *reason,
353        PyObject *unicode, PyObject **exceptionObject,
354        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355 
356 static void
357 raise_encode_exception(PyObject **exceptionObject,
358                        const char *encoding,
359                        PyObject *unicode,
360                        Py_ssize_t startpos, Py_ssize_t endpos,
361                        const char *reason);
362 
/* Fast detection of line break characters among the 7-bit code points.

   The table has one entry per code point 0..127; an entry is 1 when the code
   point is a line break and 0 otherwise.  Entries that are not listed below
   are zero-initialized. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
390 
391 static int convert_uc(PyObject *obj, void *addr);
392 
393 struct encoding_map;
394 #include "clinic/unicodeobject.c.h"
395 
396 _Py_error_handler
_Py_GetErrorHandler(const char * errors)397 _Py_GetErrorHandler(const char *errors)
398 {
399     if (errors == NULL || strcmp(errors, "strict") == 0) {
400         return _Py_ERROR_STRICT;
401     }
402     if (strcmp(errors, "surrogateescape") == 0) {
403         return _Py_ERROR_SURROGATEESCAPE;
404     }
405     if (strcmp(errors, "replace") == 0) {
406         return _Py_ERROR_REPLACE;
407     }
408     if (strcmp(errors, "ignore") == 0) {
409         return _Py_ERROR_IGNORE;
410     }
411     if (strcmp(errors, "backslashreplace") == 0) {
412         return _Py_ERROR_BACKSLASHREPLACE;
413     }
414     if (strcmp(errors, "surrogatepass") == 0) {
415         return _Py_ERROR_SURROGATEPASS;
416     }
417     if (strcmp(errors, "xmlcharrefreplace") == 0) {
418         return _Py_ERROR_XMLCHARREFREPLACE;
419     }
420     return _Py_ERROR_OTHER;
421 }
422 
423 
424 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)425 get_error_handler_wide(const wchar_t *errors)
426 {
427     if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428         return _Py_ERROR_STRICT;
429     }
430     if (wcscmp(errors, L"surrogateescape") == 0) {
431         return _Py_ERROR_SURROGATEESCAPE;
432     }
433     if (wcscmp(errors, L"replace") == 0) {
434         return _Py_ERROR_REPLACE;
435     }
436     if (wcscmp(errors, L"ignore") == 0) {
437         return _Py_ERROR_IGNORE;
438     }
439     if (wcscmp(errors, L"backslashreplace") == 0) {
440         return _Py_ERROR_BACKSLASHREPLACE;
441     }
442     if (wcscmp(errors, L"surrogatepass") == 0) {
443         return _Py_ERROR_SURROGATEPASS;
444     }
445     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446         return _Py_ERROR_XMLCHARREFREPLACE;
447     }
448     return _Py_ERROR_OTHER;
449 }
450 
451 
452 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)453 unicode_check_encoding_errors(const char *encoding, const char *errors)
454 {
455     if (encoding == NULL && errors == NULL) {
456         return 0;
457     }
458 
459     PyInterpreterState *interp = _PyInterpreterState_GET();
460 #ifndef Py_DEBUG
461     /* In release mode, only check in development mode (-X dev) */
462     if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
463         return 0;
464     }
465 #else
466     /* Always check in debug mode */
467 #endif
468 
469     /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470        codec registry is ready: before_PyUnicode_InitEncodings() is called. */
471     if (!interp->unicode.fs_codec.encoding) {
472         return 0;
473     }
474 
475     /* Disable checks during Python finalization. For example, it allows to
476        call _PyObject_Dump() during finalization for debugging purpose. */
477     if (interp->finalizing) {
478         return 0;
479     }
480 
481     if (encoding != NULL) {
482         PyObject *handler = _PyCodec_Lookup(encoding);
483         if (handler == NULL) {
484             return -1;
485         }
486         Py_DECREF(handler);
487     }
488 
489     if (errors != NULL) {
490         PyObject *handler = PyCodec_LookupError(errors);
491         if (handler == NULL) {
492             return -1;
493         }
494         Py_DECREF(handler);
495     }
496     return 0;
497 }
498 
499 
/* Verify the internal invariants of the str object `op`.

   Every invariant is tested with the local CHECK() macro, which reports the
   failed expression through _PyObject_ASSERT_FAILED_MSG().  When
   `check_content` is non-zero and the string is not in the legacy wchar
   representation, the characters are scanned as well (an O(n) operation) to
   verify that the narrowest suitable kind is used and that the buffer ends
   with a null character.

   The only value ever returned is 1, which lets callers in this file write
   assert(_PyUnicode_CheckConsistency(op, ...)). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    unsigned int kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters directly follow the
               compact header */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* non-compact string: the characters are reached through the
               data.any pointer */
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer is present */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                CHECK(kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* an absent cached representation must have a zero length */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point of the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* the slot just past the last character must hold a null character */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
616 
617 
618 static PyObject*
unicode_result_wchar(PyObject * unicode)619 unicode_result_wchar(PyObject *unicode)
620 {
621 #ifndef Py_DEBUG
622     Py_ssize_t len;
623 
624     len = _PyUnicode_WSTR_LENGTH(unicode);
625     if (len == 0) {
626         Py_DECREF(unicode);
627         _Py_RETURN_UNICODE_EMPTY();
628     }
629 
630     if (len == 1) {
631         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
632         if ((Py_UCS4)ch < 256) {
633             Py_DECREF(unicode);
634             return get_latin1_char((unsigned char)ch);
635         }
636     }
637 
638     if (_PyUnicode_Ready(unicode) < 0) {
639         Py_DECREF(unicode);
640         return NULL;
641     }
642 #else
643     assert(Py_REFCNT(unicode) == 1);
644 
645     /* don't make the result ready in debug mode to ensure that the caller
646        makes the string ready before using it */
647     assert(_PyUnicode_CheckConsistency(unicode, 1));
648 #endif
649     return unicode;
650 }
651 
652 static PyObject*
unicode_result_ready(PyObject * unicode)653 unicode_result_ready(PyObject *unicode)
654 {
655     Py_ssize_t length;
656 
657     length = PyUnicode_GET_LENGTH(unicode);
658     if (length == 0) {
659         PyObject *empty = unicode_get_empty();
660         if (unicode != empty) {
661             Py_DECREF(unicode);
662             Py_INCREF(empty);
663         }
664         return empty;
665     }
666 
667     if (length == 1) {
668         int kind = PyUnicode_KIND(unicode);
669         if (kind == PyUnicode_1BYTE_KIND) {
670             const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
671             Py_UCS1 ch = data[0];
672             PyObject *latin1_char = LATIN1(ch);
673             if (unicode != latin1_char) {
674                 Py_INCREF(latin1_char);
675                 Py_DECREF(unicode);
676             }
677             return latin1_char;
678         }
679     }
680 
681     assert(_PyUnicode_CheckConsistency(unicode, 1));
682     return unicode;
683 }
684 
685 static PyObject*
unicode_result(PyObject * unicode)686 unicode_result(PyObject *unicode)
687 {
688     assert(_PyUnicode_CHECK(unicode));
689     if (PyUnicode_IS_READY(unicode))
690         return unicode_result_ready(unicode);
691     else
692         return unicode_result_wchar(unicode);
693 }
694 
695 static PyObject*
unicode_result_unchanged(PyObject * unicode)696 unicode_result_unchanged(PyObject *unicode)
697 {
698     if (PyUnicode_CheckExact(unicode)) {
699         if (PyUnicode_READY(unicode) == -1)
700             return NULL;
701         Py_INCREF(unicode);
702         return unicode;
703     }
704     else
705         /* Subtype -- return genuine unicode string with the same value. */
706         return _PyUnicode_Copy(unicode);
707 }
708 
709 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
710    ASCII, Latin1, UTF-8, etc. */
711 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)712 backslashreplace(_PyBytesWriter *writer, char *str,
713                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
714 {
715     Py_ssize_t size, i;
716     Py_UCS4 ch;
717     enum PyUnicode_Kind kind;
718     const void *data;
719 
720     assert(PyUnicode_IS_READY(unicode));
721     kind = PyUnicode_KIND(unicode);
722     data = PyUnicode_DATA(unicode);
723 
724     size = 0;
725     /* determine replacement size */
726     for (i = collstart; i < collend; ++i) {
727         Py_ssize_t incr;
728 
729         ch = PyUnicode_READ(kind, data, i);
730         if (ch < 0x100)
731             incr = 2+2;
732         else if (ch < 0x10000)
733             incr = 2+4;
734         else {
735             assert(ch <= MAX_UNICODE);
736             incr = 2+8;
737         }
738         if (size > PY_SSIZE_T_MAX - incr) {
739             PyErr_SetString(PyExc_OverflowError,
740                             "encoded result is too long for a Python string");
741             return NULL;
742         }
743         size += incr;
744     }
745 
746     str = _PyBytesWriter_Prepare(writer, str, size);
747     if (str == NULL)
748         return NULL;
749 
750     /* generate replacement */
751     for (i = collstart; i < collend; ++i) {
752         ch = PyUnicode_READ(kind, data, i);
753         *str++ = '\\';
754         if (ch >= 0x00010000) {
755             *str++ = 'U';
756             *str++ = Py_hexdigits[(ch>>28)&0xf];
757             *str++ = Py_hexdigits[(ch>>24)&0xf];
758             *str++ = Py_hexdigits[(ch>>20)&0xf];
759             *str++ = Py_hexdigits[(ch>>16)&0xf];
760             *str++ = Py_hexdigits[(ch>>12)&0xf];
761             *str++ = Py_hexdigits[(ch>>8)&0xf];
762         }
763         else if (ch >= 0x100) {
764             *str++ = 'u';
765             *str++ = Py_hexdigits[(ch>>12)&0xf];
766             *str++ = Py_hexdigits[(ch>>8)&0xf];
767         }
768         else
769             *str++ = 'x';
770         *str++ = Py_hexdigits[(ch>>4)&0xf];
771         *str++ = Py_hexdigits[ch&0xf];
772     }
773     return str;
774 }
775 
776 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
777    ASCII, Latin1, UTF-8, etc. */
778 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)779 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
780                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
781 {
782     Py_ssize_t size, i;
783     Py_UCS4 ch;
784     enum PyUnicode_Kind kind;
785     const void *data;
786 
787     assert(PyUnicode_IS_READY(unicode));
788     kind = PyUnicode_KIND(unicode);
789     data = PyUnicode_DATA(unicode);
790 
791     size = 0;
792     /* determine replacement size */
793     for (i = collstart; i < collend; ++i) {
794         Py_ssize_t incr;
795 
796         ch = PyUnicode_READ(kind, data, i);
797         if (ch < 10)
798             incr = 2+1+1;
799         else if (ch < 100)
800             incr = 2+2+1;
801         else if (ch < 1000)
802             incr = 2+3+1;
803         else if (ch < 10000)
804             incr = 2+4+1;
805         else if (ch < 100000)
806             incr = 2+5+1;
807         else if (ch < 1000000)
808             incr = 2+6+1;
809         else {
810             assert(ch <= MAX_UNICODE);
811             incr = 2+7+1;
812         }
813         if (size > PY_SSIZE_T_MAX - incr) {
814             PyErr_SetString(PyExc_OverflowError,
815                             "encoded result is too long for a Python string");
816             return NULL;
817         }
818         size += incr;
819     }
820 
821     str = _PyBytesWriter_Prepare(writer, str, size);
822     if (str == NULL)
823         return NULL;
824 
825     /* generate replacement */
826     for (i = collstart; i < collend; ++i) {
827         size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
828         if (size < 0) {
829             return NULL;
830         }
831         str += size;
832     }
833     return str;
834 }
835 
836 /* --- Bloom Filters ----------------------------------------------------- */
837 
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, a single bitmask is used; the low bits of each
   character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
841 
842 /* the linebreak mask is set up by _PyUnicode_Init() below */
843 
/* Number of bits used by a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT.  Compilation fails if LONG_BIT is below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
853 
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Mask consulted by BLOOM_LINEBREAK().  It starts with every bit set, so no
   character is filtered out until a more precise mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch (its low bits, ch & (BLOOM_WIDTH - 1))
   is set in mask.  A zero result means ch was not added to the mask; a
   non-zero result may be a false positive.
   Both arguments are parenthesized in the expansion so that expressions with
   lower-precedence operators (e.g. `a ^ b`) can be passed as the mask. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if ch is a line break.  Code points below 128 are looked up in
   the ascii_linebreak table; for the others the bloom mask is tested first
   and Py_UNICODE_ISLINEBREAK() is only called when the mask matches.
   ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
863 
864 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)865 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
866 {
867 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
868     do {                                               \
869         TYPE *data = (TYPE *)PTR;                      \
870         TYPE *end = data + LEN;                        \
871         Py_UCS4 ch;                                    \
872         for (; data != end; data++) {                  \
873             ch = *data;                                \
874             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
875         }                                              \
876         break;                                         \
877     } while (0)
878 
879     /* calculate simple bloom-style bitmask for a given unicode string */
880 
881     BLOOM_MASK mask;
882 
883     mask = 0;
884     switch (kind) {
885     case PyUnicode_1BYTE_KIND:
886         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
887         break;
888     case PyUnicode_2BYTE_KIND:
889         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
890         break;
891     case PyUnicode_4BYTE_KIND:
892         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
893         break;
894     default:
895         Py_UNREACHABLE();
896     }
897     return mask;
898 
899 #undef BLOOM_UPDATE
900 }
901 
902 static int
ensure_unicode(PyObject * obj)903 ensure_unicode(PyObject *obj)
904 {
905     if (!PyUnicode_Check(obj)) {
906         PyErr_Format(PyExc_TypeError,
907                      "must be str, not %.100s",
908                      Py_TYPE(obj)->tp_name);
909         return -1;
910     }
911     return PyUnicode_READY(obj);
912 }
913 
914 /* Compilation of templated routines */
915 
916 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
917 
918 #include "stringlib/asciilib.h"
919 #include "stringlib/fastsearch.h"
920 #include "stringlib/partition.h"
921 #include "stringlib/split.h"
922 #include "stringlib/count.h"
923 #include "stringlib/find.h"
924 #include "stringlib/find_max_char.h"
925 #include "stringlib/undef.h"
926 
927 #include "stringlib/ucs1lib.h"
928 #include "stringlib/fastsearch.h"
929 #include "stringlib/partition.h"
930 #include "stringlib/split.h"
931 #include "stringlib/count.h"
932 #include "stringlib/find.h"
933 #include "stringlib/replace.h"
934 #include "stringlib/find_max_char.h"
935 #include "stringlib/undef.h"
936 
937 #include "stringlib/ucs2lib.h"
938 #include "stringlib/fastsearch.h"
939 #include "stringlib/partition.h"
940 #include "stringlib/split.h"
941 #include "stringlib/count.h"
942 #include "stringlib/find.h"
943 #include "stringlib/replace.h"
944 #include "stringlib/find_max_char.h"
945 #include "stringlib/undef.h"
946 
947 #include "stringlib/ucs4lib.h"
948 #include "stringlib/fastsearch.h"
949 #include "stringlib/partition.h"
950 #include "stringlib/split.h"
951 #include "stringlib/count.h"
952 #include "stringlib/find.h"
953 #include "stringlib/replace.h"
954 #include "stringlib/find_max_char.h"
955 #include "stringlib/undef.h"
956 
957 _Py_COMP_DIAG_PUSH
958 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
959 #include "stringlib/unicodedefs.h"
960 #include "stringlib/fastsearch.h"
961 #include "stringlib/count.h"
962 #include "stringlib/find.h"
963 #include "stringlib/undef.h"
964 _Py_COMP_DIAG_POP
965 
966 #undef STRINGLIB_GET_EMPTY
967 
968 /* --- Unicode Object ----------------------------------------------------- */
969 
970 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)971 findchar(const void *s, int kind,
972          Py_ssize_t size, Py_UCS4 ch,
973          int direction)
974 {
975     switch (kind) {
976     case PyUnicode_1BYTE_KIND:
977         if ((Py_UCS1) ch != ch)
978             return -1;
979         if (direction > 0)
980             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
981         else
982             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
983     case PyUnicode_2BYTE_KIND:
984         if ((Py_UCS2) ch != ch)
985             return -1;
986         if (direction > 0)
987             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
988         else
989             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
990     case PyUnicode_4BYTE_KIND:
991         if (direction > 0)
992             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
993         else
994             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
995     default:
996         Py_UNREACHABLE();
997     }
998 }
999 
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Every character from index old_length up to (but not including) the
   current length is overwritten with 0xff bytes; nothing is done when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t length = _PyUnicode_LENGTH(unicode);

    if (old_length < length) {
        /* kind doubles as the size in bytes of one character */
        memset(data + old_length * kind, 0xff, (length - old_length) * kind);
    }
}
#endif
1018 
1019 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)1020 resize_compact(PyObject *unicode, Py_ssize_t length)
1021 {
1022     Py_ssize_t char_size;
1023     Py_ssize_t struct_size;
1024     Py_ssize_t new_size;
1025     int share_wstr;
1026     PyObject *new_unicode;
1027 #ifdef Py_DEBUG
1028     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1029 #endif
1030 
1031     assert(unicode_modifiable(unicode));
1032     assert(PyUnicode_IS_READY(unicode));
1033     assert(PyUnicode_IS_COMPACT(unicode));
1034 
1035     char_size = PyUnicode_KIND(unicode);
1036     if (PyUnicode_IS_ASCII(unicode))
1037         struct_size = sizeof(PyASCIIObject);
1038     else
1039         struct_size = sizeof(PyCompactUnicodeObject);
1040     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1041 
1042     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1043         PyErr_NoMemory();
1044         return NULL;
1045     }
1046     new_size = (struct_size + (length + 1) * char_size);
1047 
1048     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1049         PyObject_Free(_PyUnicode_UTF8(unicode));
1050         _PyUnicode_UTF8(unicode) = NULL;
1051         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1052     }
1053 #ifdef Py_REF_DEBUG
1054     _Py_RefTotal--;
1055 #endif
1056 #ifdef Py_TRACE_REFS
1057     _Py_ForgetReference(unicode);
1058 #endif
1059 
1060     new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1061     if (new_unicode == NULL) {
1062         _Py_NewReference(unicode);
1063         PyErr_NoMemory();
1064         return NULL;
1065     }
1066     unicode = new_unicode;
1067     _Py_NewReference(unicode);
1068 
1069     _PyUnicode_LENGTH(unicode) = length;
1070     if (share_wstr) {
1071         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1072         if (!PyUnicode_IS_ASCII(unicode))
1073             _PyUnicode_WSTR_LENGTH(unicode) = length;
1074     }
1075     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1076         PyObject_Free(_PyUnicode_WSTR(unicode));
1077         _PyUnicode_WSTR(unicode) = NULL;
1078         if (!PyUnicode_IS_ASCII(unicode))
1079             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1080     }
1081 #ifdef Py_DEBUG
1082     unicode_fill_invalid(unicode, old_length);
1083 #endif
1084     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1085                     length, 0);
1086     assert(_PyUnicode_CheckConsistency(unicode, 0));
1087     return unicode;
1088 }
1089 
1090 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1091 resize_inplace(PyObject *unicode, Py_ssize_t length)
1092 {
1093     wchar_t *wstr;
1094     Py_ssize_t new_size;
1095     assert(!PyUnicode_IS_COMPACT(unicode));
1096     assert(Py_REFCNT(unicode) == 1);
1097 
1098     if (PyUnicode_IS_READY(unicode)) {
1099         Py_ssize_t char_size;
1100         int share_wstr, share_utf8;
1101         void *data;
1102 #ifdef Py_DEBUG
1103         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1104 #endif
1105 
1106         data = _PyUnicode_DATA_ANY(unicode);
1107         char_size = PyUnicode_KIND(unicode);
1108         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1109         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1110 
1111         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1112             PyErr_NoMemory();
1113             return -1;
1114         }
1115         new_size = (length + 1) * char_size;
1116 
1117         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1118         {
1119             PyObject_Free(_PyUnicode_UTF8(unicode));
1120             _PyUnicode_UTF8(unicode) = NULL;
1121             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1122         }
1123 
1124         data = (PyObject *)PyObject_Realloc(data, new_size);
1125         if (data == NULL) {
1126             PyErr_NoMemory();
1127             return -1;
1128         }
1129         _PyUnicode_DATA_ANY(unicode) = data;
1130         if (share_wstr) {
1131             _PyUnicode_WSTR(unicode) = data;
1132             _PyUnicode_WSTR_LENGTH(unicode) = length;
1133         }
1134         if (share_utf8) {
1135             _PyUnicode_UTF8(unicode) = data;
1136             _PyUnicode_UTF8_LENGTH(unicode) = length;
1137         }
1138         _PyUnicode_LENGTH(unicode) = length;
1139         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1140 #ifdef Py_DEBUG
1141         unicode_fill_invalid(unicode, old_length);
1142 #endif
1143         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1144             assert(_PyUnicode_CheckConsistency(unicode, 0));
1145             return 0;
1146         }
1147     }
1148     assert(_PyUnicode_WSTR(unicode) != NULL);
1149 
1150     /* check for integer overflow */
1151     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1152         PyErr_NoMemory();
1153         return -1;
1154     }
1155     new_size = sizeof(wchar_t) * (length + 1);
1156     wstr =  _PyUnicode_WSTR(unicode);
1157     wstr = PyObject_Realloc(wstr, new_size);
1158     if (!wstr) {
1159         PyErr_NoMemory();
1160         return -1;
1161     }
1162     _PyUnicode_WSTR(unicode) = wstr;
1163     _PyUnicode_WSTR(unicode)[length] = 0;
1164     _PyUnicode_WSTR_LENGTH(unicode) = length;
1165     assert(_PyUnicode_CheckConsistency(unicode, 0));
1166     return 0;
1167 }
1168 
1169 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1170 resize_copy(PyObject *unicode, Py_ssize_t length)
1171 {
1172     Py_ssize_t copy_length;
1173     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1174         PyObject *copy;
1175 
1176         assert(PyUnicode_IS_READY(unicode));
1177 
1178         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1179         if (copy == NULL)
1180             return NULL;
1181 
1182         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1183         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1184         return copy;
1185     }
1186     else {
1187         PyObject *w;
1188 
1189         w = (PyObject*)_PyUnicode_New(length);
1190         if (w == NULL)
1191             return NULL;
1192         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1193         copy_length = Py_MIN(copy_length, length);
1194         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1195                   copy_length * sizeof(wchar_t));
1196         return w;
1197     }
1198 }
1199 
1200 /* We allocate one more byte to make sure the string is
1201    Ux0000 terminated; some code (e.g. new_identifier)
1202    relies on that.
1203 
1204    XXX This allocator could further be enhanced by assuring that the
1205    free list never reduces its size below 1.
1206 
1207 */
1208 
1209 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1210 _PyUnicode_New(Py_ssize_t length)
1211 {
1212     PyUnicodeObject *unicode;
1213     size_t new_size;
1214 
1215     /* Optimization for empty strings */
1216     if (length == 0) {
1217         return (PyUnicodeObject *)unicode_new_empty();
1218     }
1219 
1220     /* Ensure we won't overflow the size. */
1221     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1222         return (PyUnicodeObject *)PyErr_NoMemory();
1223     }
1224     if (length < 0) {
1225         PyErr_SetString(PyExc_SystemError,
1226                         "Negative size passed to _PyUnicode_New");
1227         return NULL;
1228     }
1229 
1230     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1231     if (unicode == NULL)
1232         return NULL;
1233     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1234 
1235     _PyUnicode_WSTR_LENGTH(unicode) = length;
1236     _PyUnicode_HASH(unicode) = -1;
1237     _PyUnicode_STATE(unicode).interned = 0;
1238     _PyUnicode_STATE(unicode).kind = 0;
1239     _PyUnicode_STATE(unicode).compact = 0;
1240     _PyUnicode_STATE(unicode).ready = 0;
1241     _PyUnicode_STATE(unicode).ascii = 0;
1242     _PyUnicode_DATA_ANY(unicode) = NULL;
1243     _PyUnicode_LENGTH(unicode) = 0;
1244     _PyUnicode_UTF8(unicode) = NULL;
1245     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1246 
1247     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1248     if (!_PyUnicode_WSTR(unicode)) {
1249         Py_DECREF(unicode);
1250         PyErr_NoMemory();
1251         return NULL;
1252     }
1253 
1254     /* Initialize the first element to guard against cases where
1255      * the caller fails before initializing str -- unicode_resize()
1256      * reads str[0], and the Keep-Alive optimization can keep memory
1257      * allocated for str alive across a call to unicode_dealloc(unicode).
1258      * We don't want unicode_resize to read uninitialized memory in
1259      * that case.
1260      */
1261     _PyUnicode_WSTR(unicode)[0] = 0;
1262     _PyUnicode_WSTR(unicode)[length] = 0;
1263 
1264     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1265     return unicode;
1266 }
1267 
1268 static const char*
unicode_kind_name(PyObject * unicode)1269 unicode_kind_name(PyObject *unicode)
1270 {
1271     /* don't check consistency: unicode_kind_name() is called from
1272        _PyUnicode_Dump() */
1273     if (!PyUnicode_IS_COMPACT(unicode))
1274     {
1275         if (!PyUnicode_IS_READY(unicode))
1276             return "wstr";
1277         switch (PyUnicode_KIND(unicode))
1278         {
1279         case PyUnicode_1BYTE_KIND:
1280             if (PyUnicode_IS_ASCII(unicode))
1281                 return "legacy ascii";
1282             else
1283                 return "legacy latin1";
1284         case PyUnicode_2BYTE_KIND:
1285             return "legacy UCS2";
1286         case PyUnicode_4BYTE_KIND:
1287             return "legacy UCS4";
1288         default:
1289             return "<legacy invalid kind>";
1290         }
1291     }
1292     assert(PyUnicode_IS_READY(unicode));
1293     switch (PyUnicode_KIND(unicode)) {
1294     case PyUnicode_1BYTE_KIND:
1295         if (PyUnicode_IS_ASCII(unicode))
1296             return "ascii";
1297         else
1298             return "latin1";
1299     case PyUnicode_2BYTE_KIND:
1300         return "UCS2";
1301     case PyUnicode_4BYTE_KIND:
1302         return "UCS4";
1303     default:
1304         return "<invalid compact kind>";
1305     }
1306 }
1307 
1308 #ifdef Py_DEBUG
1309 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1310 const char *_PyUnicode_utf8(void *unicode_raw){
1311     PyObject *unicode = _PyObject_CAST(unicode_raw);
1312     return PyUnicode_UTF8(unicode);
1313 }
1314 
_PyUnicode_compact_data(void * unicode_raw)1315 const void *_PyUnicode_compact_data(void *unicode_raw) {
1316     PyObject *unicode = _PyObject_CAST(unicode_raw);
1317     return _PyUnicode_COMPACT_DATA(unicode);
1318 }
_PyUnicode_data(void * unicode_raw)1319 const void *_PyUnicode_data(void *unicode_raw) {
1320     PyObject *unicode = _PyObject_CAST(unicode_raw);
1321     printf("obj %p\n", (void*)unicode);
1322     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1323     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1324     printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1325     printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1326     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1327     return PyUnicode_DATA(unicode);
1328 }
1329 
1330 void
_PyUnicode_Dump(PyObject * op)1331 _PyUnicode_Dump(PyObject *op)
1332 {
1333     PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1334     PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1335     PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1336     const void *data;
1337 
1338     if (ascii->state.compact)
1339     {
1340         if (ascii->state.ascii)
1341             data = (ascii + 1);
1342         else
1343             data = (compact + 1);
1344     }
1345     else
1346         data = unicode->data.any;
1347     printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1348 
1349     if (ascii->wstr == data)
1350         printf("shared ");
1351     printf("wstr=%p", (void *)ascii->wstr);
1352 
1353     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1354         printf(" (%zu), ", compact->wstr_length);
1355         if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1356             printf("shared ");
1357         }
1358         printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1359     }
1360     printf(", data=%p\n", data);
1361 }
1362 #endif
1363 
1364 
1365 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1366 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1367 {
1368     /* Optimization for empty strings */
1369     if (size == 0) {
1370         return unicode_new_empty();
1371     }
1372 
1373     PyObject *obj;
1374     PyCompactUnicodeObject *unicode;
1375     void *data;
1376     enum PyUnicode_Kind kind;
1377     int is_sharing, is_ascii;
1378     Py_ssize_t char_size;
1379     Py_ssize_t struct_size;
1380 
1381     is_ascii = 0;
1382     is_sharing = 0;
1383     struct_size = sizeof(PyCompactUnicodeObject);
1384     if (maxchar < 128) {
1385         kind = PyUnicode_1BYTE_KIND;
1386         char_size = 1;
1387         is_ascii = 1;
1388         struct_size = sizeof(PyASCIIObject);
1389     }
1390     else if (maxchar < 256) {
1391         kind = PyUnicode_1BYTE_KIND;
1392         char_size = 1;
1393     }
1394     else if (maxchar < 65536) {
1395         kind = PyUnicode_2BYTE_KIND;
1396         char_size = 2;
1397         if (sizeof(wchar_t) == 2)
1398             is_sharing = 1;
1399     }
1400     else {
1401         if (maxchar > MAX_UNICODE) {
1402             PyErr_SetString(PyExc_SystemError,
1403                             "invalid maximum character passed to PyUnicode_New");
1404             return NULL;
1405         }
1406         kind = PyUnicode_4BYTE_KIND;
1407         char_size = 4;
1408         if (sizeof(wchar_t) == 4)
1409             is_sharing = 1;
1410     }
1411 
1412     /* Ensure we won't overflow the size. */
1413     if (size < 0) {
1414         PyErr_SetString(PyExc_SystemError,
1415                         "Negative size passed to PyUnicode_New");
1416         return NULL;
1417     }
1418     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1419         return PyErr_NoMemory();
1420 
1421     /* Duplicated allocation code from _PyObject_New() instead of a call to
1422      * PyObject_New() so we are able to allocate space for the object and
1423      * it's data buffer.
1424      */
1425     obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1426     if (obj == NULL) {
1427         return PyErr_NoMemory();
1428     }
1429     _PyObject_Init(obj, &PyUnicode_Type);
1430 
1431     unicode = (PyCompactUnicodeObject *)obj;
1432     if (is_ascii)
1433         data = ((PyASCIIObject*)obj) + 1;
1434     else
1435         data = unicode + 1;
1436     _PyUnicode_LENGTH(unicode) = size;
1437     _PyUnicode_HASH(unicode) = -1;
1438     _PyUnicode_STATE(unicode).interned = 0;
1439     _PyUnicode_STATE(unicode).kind = kind;
1440     _PyUnicode_STATE(unicode).compact = 1;
1441     _PyUnicode_STATE(unicode).ready = 1;
1442     _PyUnicode_STATE(unicode).ascii = is_ascii;
1443     if (is_ascii) {
1444         ((char*)data)[size] = 0;
1445         _PyUnicode_WSTR(unicode) = NULL;
1446     }
1447     else if (kind == PyUnicode_1BYTE_KIND) {
1448         ((char*)data)[size] = 0;
1449         _PyUnicode_WSTR(unicode) = NULL;
1450         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451         unicode->utf8 = NULL;
1452         unicode->utf8_length = 0;
1453     }
1454     else {
1455         unicode->utf8 = NULL;
1456         unicode->utf8_length = 0;
1457         if (kind == PyUnicode_2BYTE_KIND)
1458             ((Py_UCS2*)data)[size] = 0;
1459         else /* kind == PyUnicode_4BYTE_KIND */
1460             ((Py_UCS4*)data)[size] = 0;
1461         if (is_sharing) {
1462             _PyUnicode_WSTR_LENGTH(unicode) = size;
1463             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1464         }
1465         else {
1466             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1467             _PyUnicode_WSTR(unicode) = NULL;
1468         }
1469     }
1470 #ifdef Py_DEBUG
1471     unicode_fill_invalid((PyObject*)unicode, 0);
1472 #endif
1473     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1474     return obj;
1475 }
1476 
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate immediately followed by a low surrogate is joined into
   one code point; every other unit is copied unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *out = PyUnicode_4BYTE_DATA(unicode);
    const wchar_t *in = begin;

    while (in < end) {
        assert(out < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(in[0])
            && (in + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(in[1]))
        {
            *out = Py_UNICODE_JOIN_SURROGATES(in[0], in[1]);
            out++;
            in += 2;
        }
        else {
            *out = *in;
            out++;
            in++;
        }
    }
    /* the output must have been filled exactly */
    assert(out == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1516 
1517 static int
unicode_check_modifiable(PyObject * unicode)1518 unicode_check_modifiable(PyObject *unicode)
1519 {
1520     if (!unicode_modifiable(unicode)) {
1521         PyErr_SetString(PyExc_SystemError,
1522                         "Cannot modify a string currently used");
1523         return -1;
1524     }
1525     return 0;
1526 }
1527 
1528 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1529 _copy_characters(PyObject *to, Py_ssize_t to_start,
1530                  PyObject *from, Py_ssize_t from_start,
1531                  Py_ssize_t how_many, int check_maxchar)
1532 {
1533     unsigned int from_kind, to_kind;
1534     const void *from_data;
1535     void *to_data;
1536 
1537     assert(0 <= how_many);
1538     assert(0 <= from_start);
1539     assert(0 <= to_start);
1540     assert(PyUnicode_Check(from));
1541     assert(PyUnicode_IS_READY(from));
1542     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1543 
1544     assert(PyUnicode_Check(to));
1545     assert(PyUnicode_IS_READY(to));
1546     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1547 
1548     if (how_many == 0)
1549         return 0;
1550 
1551     from_kind = PyUnicode_KIND(from);
1552     from_data = PyUnicode_DATA(from);
1553     to_kind = PyUnicode_KIND(to);
1554     to_data = PyUnicode_DATA(to);
1555 
1556 #ifdef Py_DEBUG
1557     if (!check_maxchar
1558         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1559     {
1560         Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1561         Py_UCS4 ch;
1562         Py_ssize_t i;
1563         for (i=0; i < how_many; i++) {
1564             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1565             assert(ch <= to_maxchar);
1566         }
1567     }
1568 #endif
1569 
1570     if (from_kind == to_kind) {
1571         if (check_maxchar
1572             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1573         {
1574             /* Writing Latin-1 characters into an ASCII string requires to
1575                check that all written characters are pure ASCII */
1576             Py_UCS4 max_char;
1577             max_char = ucs1lib_find_max_char(from_data,
1578                                              (const Py_UCS1*)from_data + how_many);
1579             if (max_char >= 128)
1580                 return -1;
1581         }
1582         memcpy((char*)to_data + to_kind * to_start,
1583                   (const char*)from_data + from_kind * from_start,
1584                   to_kind * how_many);
1585     }
1586     else if (from_kind == PyUnicode_1BYTE_KIND
1587              && to_kind == PyUnicode_2BYTE_KIND)
1588     {
1589         _PyUnicode_CONVERT_BYTES(
1590             Py_UCS1, Py_UCS2,
1591             PyUnicode_1BYTE_DATA(from) + from_start,
1592             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1593             PyUnicode_2BYTE_DATA(to) + to_start
1594             );
1595     }
1596     else if (from_kind == PyUnicode_1BYTE_KIND
1597              && to_kind == PyUnicode_4BYTE_KIND)
1598     {
1599         _PyUnicode_CONVERT_BYTES(
1600             Py_UCS1, Py_UCS4,
1601             PyUnicode_1BYTE_DATA(from) + from_start,
1602             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1603             PyUnicode_4BYTE_DATA(to) + to_start
1604             );
1605     }
1606     else if (from_kind == PyUnicode_2BYTE_KIND
1607              && to_kind == PyUnicode_4BYTE_KIND)
1608     {
1609         _PyUnicode_CONVERT_BYTES(
1610             Py_UCS2, Py_UCS4,
1611             PyUnicode_2BYTE_DATA(from) + from_start,
1612             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1613             PyUnicode_4BYTE_DATA(to) + to_start
1614             );
1615     }
1616     else {
1617         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1618 
1619         if (!check_maxchar) {
1620             if (from_kind == PyUnicode_2BYTE_KIND
1621                 && to_kind == PyUnicode_1BYTE_KIND)
1622             {
1623                 _PyUnicode_CONVERT_BYTES(
1624                     Py_UCS2, Py_UCS1,
1625                     PyUnicode_2BYTE_DATA(from) + from_start,
1626                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627                     PyUnicode_1BYTE_DATA(to) + to_start
1628                     );
1629             }
1630             else if (from_kind == PyUnicode_4BYTE_KIND
1631                      && to_kind == PyUnicode_1BYTE_KIND)
1632             {
1633                 _PyUnicode_CONVERT_BYTES(
1634                     Py_UCS4, Py_UCS1,
1635                     PyUnicode_4BYTE_DATA(from) + from_start,
1636                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1637                     PyUnicode_1BYTE_DATA(to) + to_start
1638                     );
1639             }
1640             else if (from_kind == PyUnicode_4BYTE_KIND
1641                      && to_kind == PyUnicode_2BYTE_KIND)
1642             {
1643                 _PyUnicode_CONVERT_BYTES(
1644                     Py_UCS4, Py_UCS2,
1645                     PyUnicode_4BYTE_DATA(from) + from_start,
1646                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1647                     PyUnicode_2BYTE_DATA(to) + to_start
1648                     );
1649             }
1650             else {
1651                 Py_UNREACHABLE();
1652             }
1653         }
1654         else {
1655             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1656             Py_UCS4 ch;
1657             Py_ssize_t i;
1658 
1659             for (i=0; i < how_many; i++) {
1660                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1661                 if (ch > to_maxchar)
1662                     return -1;
1663                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1664             }
1665         }
1666     }
1667     return 0;
1668 }
1669 
1670 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1671 _PyUnicode_FastCopyCharacters(
1672     PyObject *to, Py_ssize_t to_start,
1673     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1674 {
1675     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1676 }
1677 
1678 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1679 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1680                          PyObject *from, Py_ssize_t from_start,
1681                          Py_ssize_t how_many)
1682 {
1683     int err;
1684 
1685     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1686         PyErr_BadInternalCall();
1687         return -1;
1688     }
1689 
1690     if (PyUnicode_READY(from) == -1)
1691         return -1;
1692     if (PyUnicode_READY(to) == -1)
1693         return -1;
1694 
1695     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1696         PyErr_SetString(PyExc_IndexError, "string index out of range");
1697         return -1;
1698     }
1699     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1700         PyErr_SetString(PyExc_IndexError, "string index out of range");
1701         return -1;
1702     }
1703     if (how_many < 0) {
1704         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1705         return -1;
1706     }
1707     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1708     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1709         PyErr_Format(PyExc_SystemError,
1710                      "Cannot write %zi characters at %zi "
1711                      "in a string of %zi characters",
1712                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1713         return -1;
1714     }
1715 
1716     if (how_many == 0)
1717         return 0;
1718 
1719     if (unicode_check_modifiable(to))
1720         return -1;
1721 
1722     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1723     if (err) {
1724         PyErr_Format(PyExc_SystemError,
1725                      "Cannot copy %s characters "
1726                      "into a string of %s characters",
1727                      unicode_kind_name(from),
1728                      unicode_kind_name(to));
1729         return -1;
1730     }
1731     return how_many;
1732 }
1733 
1734 /* Find the maximum code point and count the number of surrogate pairs so a
1735    correct string length can be computed before converting a string to UCS4.
1736    This function counts single surrogates as a character and not as a pair.
1737 
1738    Return 0 on success, or -1 on error. */
1739 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1740 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1741                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1742 {
1743     const wchar_t *iter;
1744     Py_UCS4 ch;
1745 
1746     assert(num_surrogates != NULL && maxchar != NULL);
1747     *num_surrogates = 0;
1748     *maxchar = 0;
1749 
1750     for (iter = begin; iter < end; ) {
1751 #if SIZEOF_WCHAR_T == 2
1752         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1753             && (iter+1) < end
1754             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1755         {
1756             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1757             ++(*num_surrogates);
1758             iter += 2;
1759         }
1760         else
1761 #endif
1762         {
1763             ch = *iter;
1764             iter++;
1765         }
1766         if (ch > *maxchar) {
1767             *maxchar = ch;
1768             if (*maxchar > MAX_UNICODE) {
1769                 PyErr_Format(PyExc_ValueError,
1770                              "character U+%x is not in range [U+0000; U+%x]",
1771                              ch, MAX_UNICODE);
1772                 return -1;
1773             }
1774         }
1775     }
1776     return 0;
1777 }
1778 
/* Give a legacy string (one that only has a wchar_t buffer) its canonical
   representation.

   The wchar_t buffer is scanned for its largest code point, and the
   characters are then stored as 1-, 2- or 4-byte units, whichever is the
   narrowest kind that fits.  Depending on sizeof(wchar_t), the wchar_t
   buffer is either converted into a newly allocated data buffer and freed,
   or reused directly as the data buffer.

   Return 0 on success.  Return -1 with an exception set if a character is
   out of range or a memory allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* A single scan yields the largest code point (which selects the
       storage kind) and the number of surrogate pairs (which determines the
       character count when wchar_t is 2 bytes wide). */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte storage: one byte per character plus the terminator. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer also serves as the UTF-8
               representation. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer has been converted and is no longer needed. */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow each 4-byte wchar_t into a 2-byte unit in a new buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair becomes a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Reject lengths whose 4-byte buffer size (including the
           terminator) would not fit in Py_ssize_t. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: the wchar_t buffer is used as
           the data buffer and is not freed here. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1907 
/* Destroy a string object.

   A mortal interned string is first removed from the `interned`
   dictionary.  After that the separately owned buffers (wstr, UTF-8 and,
   for non-compact strings, the data buffer) are released, and finally the
   object itself is freed through its type's tp_free slot. */
static void
unicode_dealloc(PyObject *unicode)
{
#ifdef Py_DEBUG
    /* Debug builds abort if a shared singleton is deallocated while the
       unicode_is_finalizing() check reports false. */
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
        _Py_FatalRefcountError("deallocating an Unicode singleton");
    }
#endif

    switch (PyUnicode_CHECK_INTERNED(unicode)) {
    case SSTATE_NOT_INTERNED:
        break;
    case SSTATE_INTERNED_MORTAL:
    {
        /* Revive the dead object temporarily. PyDict_DelItem() removes two
           references (key and value) which were ignored by
           PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
           to prevent calling unicode_dealloc() again. Adjust refcnt after
           PyDict_DelItem(). */
        assert(Py_REFCNT(unicode) == 0);
        Py_SET_REFCNT(unicode, 3);
        if (PyDict_DelItem(interned, unicode) != 0) {
            /* Deallocation cannot propagate an exception, so the failure
               is only reported as unraisable. */
            _PyErr_WriteUnraisableMsg("deletion of interned string failed",
                                      NULL);
        }
        assert(Py_REFCNT(unicode) == 1);
        Py_SET_REFCNT(unicode, 0);
        break;
    }

    case SSTATE_INTERNED_IMMORTAL:
        _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
        break;

    default:
        Py_UNREACHABLE();
    }

    /* Free only the buffers the _HAS_*_MEMORY checks report as owned. */
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_WSTR(unicode));
    }
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
    }
    /* The data buffer is freed separately only for non-compact strings. */
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
        PyObject_Free(_PyUnicode_DATA_ANY(unicode));
    }

    Py_TYPE(unicode)->tp_free(unicode);
}
1958 
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the empty-string singleton or
   the object LATIN1() yields for its single character, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *header = _PyASCIIObject_CAST(unicode);
    /* Only a ready, one-character string can be a Latin-1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }

    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256) {
        return 0;
    }
    return (LATIN1(first) == unicode) ? 1 : 0;
}
#endif
1977 
1978 static int
unicode_modifiable(PyObject * unicode)1979 unicode_modifiable(PyObject *unicode)
1980 {
1981     assert(_PyUnicode_CHECK(unicode));
1982     if (Py_REFCNT(unicode) != 1)
1983         return 0;
1984     if (_PyUnicode_HASH(unicode) != -1)
1985         return 0;
1986     if (PyUnicode_CHECK_INTERNED(unicode))
1987         return 0;
1988     if (!PyUnicode_CheckExact(unicode))
1989         return 0;
1990 #ifdef Py_DEBUG
1991     /* singleton refcount is greater than 1 */
1992     assert(!unicode_is_singleton(unicode));
1993 #endif
1994     return 1;
1995 }
1996 
1997 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1998 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1999 {
2000     PyObject *unicode;
2001     Py_ssize_t old_length;
2002 
2003     assert(p_unicode != NULL);
2004     unicode = *p_unicode;
2005 
2006     assert(unicode != NULL);
2007     assert(PyUnicode_Check(unicode));
2008     assert(0 <= length);
2009 
2010     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2011         old_length = PyUnicode_WSTR_LENGTH(unicode);
2012     else
2013         old_length = PyUnicode_GET_LENGTH(unicode);
2014     if (old_length == length)
2015         return 0;
2016 
2017     if (length == 0) {
2018         PyObject *empty = unicode_new_empty();
2019         Py_SETREF(*p_unicode, empty);
2020         return 0;
2021     }
2022 
2023     if (!unicode_modifiable(unicode)) {
2024         PyObject *copy = resize_copy(unicode, length);
2025         if (copy == NULL)
2026             return -1;
2027         Py_SETREF(*p_unicode, copy);
2028         return 0;
2029     }
2030 
2031     if (PyUnicode_IS_COMPACT(unicode)) {
2032         PyObject *new_unicode = resize_compact(unicode, length);
2033         if (new_unicode == NULL)
2034             return -1;
2035         *p_unicode = new_unicode;
2036         return 0;
2037     }
2038     return resize_inplace(unicode, length);
2039 }
2040 
2041 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2042 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2043 {
2044     PyObject *unicode;
2045     if (p_unicode == NULL) {
2046         PyErr_BadInternalCall();
2047         return -1;
2048     }
2049     unicode = *p_unicode;
2050     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2051     {
2052         PyErr_BadInternalCall();
2053         return -1;
2054     }
2055     return unicode_resize(p_unicode, length);
2056 }
2057 
2058 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2059 
2060    WARNING: The function doesn't copy the terminating null character and
2061    doesn't check the maximum character (may write a latin1 character in an
2062    ASCII string). */
2063 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2064 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2065                    const char *str, Py_ssize_t len)
2066 {
2067     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2068     const void *data = PyUnicode_DATA(unicode);
2069     const char *end = str + len;
2070 
2071     assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2072     switch (kind) {
2073     case PyUnicode_1BYTE_KIND: {
2074 #ifdef Py_DEBUG
2075         if (PyUnicode_IS_ASCII(unicode)) {
2076             Py_UCS4 maxchar = ucs1lib_find_max_char(
2077                 (const Py_UCS1*)str,
2078                 (const Py_UCS1*)str + len);
2079             assert(maxchar < 128);
2080         }
2081 #endif
2082         memcpy((char *) data + index, str, len);
2083         break;
2084     }
2085     case PyUnicode_2BYTE_KIND: {
2086         Py_UCS2 *start = (Py_UCS2 *)data + index;
2087         Py_UCS2 *ucs2 = start;
2088 
2089         for (; str < end; ++ucs2, ++str)
2090             *ucs2 = (Py_UCS2)*str;
2091 
2092         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2093         break;
2094     }
2095     case PyUnicode_4BYTE_KIND: {
2096         Py_UCS4 *start = (Py_UCS4 *)data + index;
2097         Py_UCS4 *ucs4 = start;
2098 
2099         for (; str < end; ++ucs4, ++str)
2100             *ucs4 = (Py_UCS4)*str;
2101 
2102         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2103         break;
2104     }
2105     default:
2106         Py_UNREACHABLE();
2107     }
2108 }
2109 
2110 static PyObject*
get_latin1_char(Py_UCS1 ch)2111 get_latin1_char(Py_UCS1 ch)
2112 {
2113     return Py_NewRef(LATIN1(ch));
2114 }
2115 
2116 static PyObject*
unicode_char(Py_UCS4 ch)2117 unicode_char(Py_UCS4 ch)
2118 {
2119     PyObject *unicode;
2120 
2121     assert(ch <= MAX_UNICODE);
2122 
2123     if (ch < 256) {
2124         return get_latin1_char(ch);
2125     }
2126 
2127     unicode = PyUnicode_New(1, ch);
2128     if (unicode == NULL)
2129         return NULL;
2130 
2131     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2132     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2133         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2134     } else {
2135         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2136         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2137     }
2138     assert(_PyUnicode_CheckConsistency(unicode, 1));
2139     return unicode;
2140 }
2141 
2142 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2143 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2144 {
2145     if (u == NULL) {
2146         if (size > 0) {
2147             if (PyErr_WarnEx(PyExc_DeprecationWarning,
2148                     "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2149                     "use PyUnicode_New() instead", 1) < 0) {
2150                 return NULL;
2151             }
2152         }
2153         return (PyObject*)_PyUnicode_New(size);
2154     }
2155 
2156     if (size < 0) {
2157         PyErr_BadInternalCall();
2158         return NULL;
2159     }
2160 
2161     return PyUnicode_FromWideChar(u, size);
2162 }
2163 
/* Create a string from the wchar_t buffer `u`.

   `size` is the number of wchar_t units to read; passing -1 makes the
   function measure the buffer with wcslen().  A NULL `u` is only accepted
   together with a `size` of 0.

   Return a new string object, or NULL with an exception set on error
   (bad arguments, an out-of-range character, or allocation failure). */
PyObject *
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
{
    PyObject *unicode;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;

    if (u == NULL && size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* -1 means "null-terminated": determine the length here. */
    if (size == -1) {
        size = wcslen(u);
    }

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects. */

    /* Optimization for empty strings */
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
       non-Unicode locales and hence needs conversion to UCS-4 first. */
    if (_Py_LocaleUsesNonUnicodeWchar()) {
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
        if (!converted) {
            return NULL;
        }
        /* NOTE: this declaration shadows the outer `unicode`. */
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
        PyMem_Free(converted);
        return unicode;
    }
#endif

    /* Single character Unicode objects in the Latin-1 range are
       shared when using this constructor */
    if (size == 1 && (Py_UCS4)*u < 256)
        return get_latin1_char((unsigned char)*u);

    /* If not empty and not single character, copy the Unicode data
       into the new object */
    if (find_maxchar_surrogates(u, u + size,
                                &maxchar, &num_surrogates) == -1)
        return NULL;

    /* Each surrogate pair occupies two input units but yields a single
       character in the result. */
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
    if (!unicode)
        return NULL;

    /* Copy the input into the new string, narrowing or widening each unit
       according to the kind PyUnicode_New() chose for `maxchar`. */
    switch (PyUnicode_KIND(unicode)) {
    case PyUnicode_1BYTE_KIND:
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
        break;
    case PyUnicode_2BYTE_KIND:
#if Py_UNICODE_SIZE == 2
        /* Same unit width on both sides: a raw copy is enough. */
        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
#else
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
#endif
        break;
    case PyUnicode_4BYTE_KIND:
#if SIZEOF_WCHAR_T == 2
        /* This is the only case which has to process surrogates, thus
           a simple copy loop is not enough and we need a function. */
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
#else
        assert(num_surrogates == 0);
        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
#endif
        break;
    default:
        Py_UNREACHABLE();
    }

    return unicode_result(unicode);
}
2245 
2246 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2247 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2248 {
2249     if (size < 0) {
2250         PyErr_SetString(PyExc_SystemError,
2251                         "Negative size passed to PyUnicode_FromStringAndSize");
2252         return NULL;
2253     }
2254     if (u != NULL) {
2255         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2256     }
2257     else {
2258         if (size > 0) {
2259             if (PyErr_WarnEx(PyExc_DeprecationWarning,
2260                     "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2261                     "use PyUnicode_New() instead", 1) < 0) {
2262                 return NULL;
2263             }
2264         }
2265         return (PyObject *)_PyUnicode_New(size);
2266     }
2267 }
2268 
2269 PyObject *
PyUnicode_FromString(const char * u)2270 PyUnicode_FromString(const char *u)
2271 {
2272     size_t size = strlen(u);
2273     if (size > PY_SSIZE_T_MAX) {
2274         PyErr_SetString(PyExc_OverflowError, "input too long");
2275         return NULL;
2276     }
2277     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2278 }
2279 
2280 
2281 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2282 _PyUnicode_FromId(_Py_Identifier *id)
2283 {
2284     PyInterpreterState *interp = _PyInterpreterState_GET();
2285     struct _Py_unicode_ids *ids = &interp->unicode.ids;
2286 
2287     Py_ssize_t index = _Py_atomic_size_get(&id->index);
2288     if (index < 0) {
2289         struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2290 
2291         PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2292         // Check again to detect concurrent access. Another thread can have
2293         // initialized the index while this thread waited for the lock.
2294         index = _Py_atomic_size_get(&id->index);
2295         if (index < 0) {
2296             assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2297             index = rt_ids->next_index;
2298             rt_ids->next_index++;
2299             _Py_atomic_size_set(&id->index, index);
2300         }
2301         PyThread_release_lock(rt_ids->lock);
2302     }
2303     assert(index >= 0);
2304 
2305     PyObject *obj;
2306     if (index < ids->size) {
2307         obj = ids->array[index];
2308         if (obj) {
2309             // Return a borrowed reference
2310             return obj;
2311         }
2312     }
2313 
2314     obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2315                                        NULL, NULL);
2316     if (!obj) {
2317         return NULL;
2318     }
2319     PyUnicode_InternInPlace(&obj);
2320 
2321     if (index >= ids->size) {
2322         // Overallocate to reduce the number of realloc
2323         Py_ssize_t new_size = Py_MAX(index * 2, 16);
2324         Py_ssize_t item_size = sizeof(ids->array[0]);
2325         PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2326         if (new_array == NULL) {
2327             PyErr_NoMemory();
2328             return NULL;
2329         }
2330         memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2331         ids->array = new_array;
2332         ids->size = new_size;
2333     }
2334 
2335     // The array stores a strong reference
2336     ids->array[index] = obj;
2337 
2338     // Return a borrowed reference
2339     return obj;
2340 }
2341 
2342 
2343 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2344 unicode_clear_identifiers(struct _Py_unicode_state *state)
2345 {
2346     struct _Py_unicode_ids *ids = &state->ids;
2347     for (Py_ssize_t i=0; i < ids->size; i++) {
2348         Py_XDECREF(ids->array[i]);
2349     }
2350     ids->size = 0;
2351     PyMem_Free(ids->array);
2352     ids->array = NULL;
2353     // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2354     // after Py_Finalize().
2355 }
2356 
2357 
2358 /* Internal function, doesn't check maximum character */
2359 
2360 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2361 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2362 {
2363     const unsigned char *s = (const unsigned char *)buffer;
2364     PyObject *unicode;
2365     if (size == 1) {
2366 #ifdef Py_DEBUG
2367         assert((unsigned char)s[0] < 128);
2368 #endif
2369         return get_latin1_char(s[0]);
2370     }
2371     unicode = PyUnicode_New(size, 127);
2372     if (!unicode)
2373         return NULL;
2374     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2375     assert(_PyUnicode_CheckConsistency(unicode, 1));
2376     return unicode;
2377 }
2378 
2379 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2380 kind_maxchar_limit(unsigned int kind)
2381 {
2382     switch (kind) {
2383     case PyUnicode_1BYTE_KIND:
2384         return 0x80;
2385     case PyUnicode_2BYTE_KIND:
2386         return 0x100;
2387     case PyUnicode_4BYTE_KIND:
2388         return 0x10000;
2389     default:
2390         Py_UNREACHABLE();
2391     }
2392 }
2393 
2394 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2395 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2396 {
2397     PyObject *res;
2398     unsigned char max_char;
2399 
2400     if (size == 0) {
2401         _Py_RETURN_UNICODE_EMPTY();
2402     }
2403     assert(size > 0);
2404     if (size == 1) {
2405         return get_latin1_char(u[0]);
2406     }
2407 
2408     max_char = ucs1lib_find_max_char(u, u + size);
2409     res = PyUnicode_New(size, max_char);
2410     if (!res)
2411         return NULL;
2412     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2413     assert(_PyUnicode_CheckConsistency(res, 1));
2414     return res;
2415 }
2416 
2417 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2418 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2419 {
2420     PyObject *res;
2421     Py_UCS2 max_char;
2422 
2423     if (size == 0)
2424         _Py_RETURN_UNICODE_EMPTY();
2425     assert(size > 0);
2426     if (size == 1)
2427         return unicode_char(u[0]);
2428 
2429     max_char = ucs2lib_find_max_char(u, u + size);
2430     res = PyUnicode_New(size, max_char);
2431     if (!res)
2432         return NULL;
2433     if (max_char >= 256)
2434         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2435     else {
2436         _PyUnicode_CONVERT_BYTES(
2437             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2438     }
2439     assert(_PyUnicode_CheckConsistency(res, 1));
2440     return res;
2441 }
2442 
2443 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2444 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2445 {
2446     PyObject *res;
2447     Py_UCS4 max_char;
2448 
2449     if (size == 0)
2450         _Py_RETURN_UNICODE_EMPTY();
2451     assert(size > 0);
2452     if (size == 1)
2453         return unicode_char(u[0]);
2454 
2455     max_char = ucs4lib_find_max_char(u, u + size);
2456     res = PyUnicode_New(size, max_char);
2457     if (!res)
2458         return NULL;
2459     if (max_char < 256)
2460         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2461                                  PyUnicode_1BYTE_DATA(res));
2462     else if (max_char < 0x10000)
2463         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2464                                  PyUnicode_2BYTE_DATA(res));
2465     else
2466         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2467     assert(_PyUnicode_CheckConsistency(res, 1));
2468     return res;
2469 }
2470 
2471 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2472 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2473 {
2474     if (size < 0) {
2475         PyErr_SetString(PyExc_ValueError, "size must be positive");
2476         return NULL;
2477     }
2478     switch (kind) {
2479     case PyUnicode_1BYTE_KIND:
2480         return _PyUnicode_FromUCS1(buffer, size);
2481     case PyUnicode_2BYTE_KIND:
2482         return _PyUnicode_FromUCS2(buffer, size);
2483     case PyUnicode_4BYTE_KIND:
2484         return _PyUnicode_FromUCS4(buffer, size);
2485     default:
2486         PyErr_SetString(PyExc_SystemError, "invalid kind");
2487         return NULL;
2488     }
2489 }
2490 
2491 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2492 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2493 {
2494     enum PyUnicode_Kind kind;
2495     const void *startptr, *endptr;
2496 
2497     assert(PyUnicode_IS_READY(unicode));
2498     assert(0 <= start);
2499     assert(end <= PyUnicode_GET_LENGTH(unicode));
2500     assert(start <= end);
2501 
2502     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2503         return PyUnicode_MAX_CHAR_VALUE(unicode);
2504 
2505     if (start == end)
2506         return 127;
2507 
2508     if (PyUnicode_IS_ASCII(unicode))
2509         return 127;
2510 
2511     kind = PyUnicode_KIND(unicode);
2512     startptr = PyUnicode_DATA(unicode);
2513     endptr = (char *)startptr + end * kind;
2514     startptr = (char *)startptr + start * kind;
2515     switch(kind) {
2516     case PyUnicode_1BYTE_KIND:
2517         return ucs1lib_find_max_char(startptr, endptr);
2518     case PyUnicode_2BYTE_KIND:
2519         return ucs2lib_find_max_char(startptr, endptr);
2520     case PyUnicode_4BYTE_KIND:
2521         return ucs4lib_find_max_char(startptr, endptr);
2522     default:
2523         Py_UNREACHABLE();
2524     }
2525 }
2526 
2527 /* Ensure that a string uses the most efficient storage, if it is not the
2528    case: create a new string with of the right kind. Write NULL into *p_unicode
2529    on error. */
2530 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2531 unicode_adjust_maxchar(PyObject **p_unicode)
2532 {
2533     PyObject *unicode, *copy;
2534     Py_UCS4 max_char;
2535     Py_ssize_t len;
2536     unsigned int kind;
2537 
2538     assert(p_unicode != NULL);
2539     unicode = *p_unicode;
2540     assert(PyUnicode_IS_READY(unicode));
2541     if (PyUnicode_IS_ASCII(unicode))
2542         return;
2543 
2544     len = PyUnicode_GET_LENGTH(unicode);
2545     kind = PyUnicode_KIND(unicode);
2546     if (kind == PyUnicode_1BYTE_KIND) {
2547         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2548         max_char = ucs1lib_find_max_char(u, u + len);
2549         if (max_char >= 128)
2550             return;
2551     }
2552     else if (kind == PyUnicode_2BYTE_KIND) {
2553         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2554         max_char = ucs2lib_find_max_char(u, u + len);
2555         if (max_char >= 256)
2556             return;
2557     }
2558     else if (kind == PyUnicode_4BYTE_KIND) {
2559         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2560         max_char = ucs4lib_find_max_char(u, u + len);
2561         if (max_char >= 0x10000)
2562             return;
2563     }
2564     else
2565         Py_UNREACHABLE();
2566 
2567     copy = PyUnicode_New(len, max_char);
2568     if (copy != NULL)
2569         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2570     Py_DECREF(unicode);
2571     *p_unicode = copy;
2572 }
2573 
2574 PyObject*
_PyUnicode_Copy(PyObject * unicode)2575 _PyUnicode_Copy(PyObject *unicode)
2576 {
2577     Py_ssize_t length;
2578     PyObject *copy;
2579 
2580     if (!PyUnicode_Check(unicode)) {
2581         PyErr_BadInternalCall();
2582         return NULL;
2583     }
2584     if (PyUnicode_READY(unicode) == -1)
2585         return NULL;
2586 
2587     length = PyUnicode_GET_LENGTH(unicode);
2588     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2589     if (!copy)
2590         return NULL;
2591     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2592 
2593     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2594               length * PyUnicode_KIND(unicode));
2595     assert(_PyUnicode_CheckConsistency(copy, 1));
2596     return copy;
2597 }
2598 
2599 
2600 /* Widen Unicode objects to larger buffers. Don't write terminating null
2601    character. Return NULL on error. */
2602 
2603 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2604 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2605 {
2606     void *result;
2607 
2608     assert(skind < kind);
2609     switch (kind) {
2610     case PyUnicode_2BYTE_KIND:
2611         result = PyMem_New(Py_UCS2, len);
2612         if (!result)
2613             return PyErr_NoMemory();
2614         assert(skind == PyUnicode_1BYTE_KIND);
2615         _PyUnicode_CONVERT_BYTES(
2616             Py_UCS1, Py_UCS2,
2617             (const Py_UCS1 *)data,
2618             ((const Py_UCS1 *)data) + len,
2619             result);
2620         return result;
2621     case PyUnicode_4BYTE_KIND:
2622         result = PyMem_New(Py_UCS4, len);
2623         if (!result)
2624             return PyErr_NoMemory();
2625         if (skind == PyUnicode_2BYTE_KIND) {
2626             _PyUnicode_CONVERT_BYTES(
2627                 Py_UCS2, Py_UCS4,
2628                 (const Py_UCS2 *)data,
2629                 ((const Py_UCS2 *)data) + len,
2630                 result);
2631         }
2632         else {
2633             assert(skind == PyUnicode_1BYTE_KIND);
2634             _PyUnicode_CONVERT_BYTES(
2635                 Py_UCS1, Py_UCS4,
2636                 (const Py_UCS1 *)data,
2637                 ((const Py_UCS1 *)data) + len,
2638                 result);
2639         }
2640         return result;
2641     default:
2642         Py_UNREACHABLE();
2643         return NULL;
2644     }
2645 }
2646 
2647 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2648 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2649         int copy_null)
2650 {
2651     int kind;
2652     const void *data;
2653     Py_ssize_t len, targetlen;
2654     if (PyUnicode_READY(string) == -1)
2655         return NULL;
2656     kind = PyUnicode_KIND(string);
2657     data = PyUnicode_DATA(string);
2658     len = PyUnicode_GET_LENGTH(string);
2659     targetlen = len;
2660     if (copy_null)
2661         targetlen++;
2662     if (!target) {
2663         target = PyMem_New(Py_UCS4, targetlen);
2664         if (!target) {
2665             PyErr_NoMemory();
2666             return NULL;
2667         }
2668     }
2669     else {
2670         if (targetsize < targetlen) {
2671             PyErr_Format(PyExc_SystemError,
2672                          "string is longer than the buffer");
2673             if (copy_null && 0 < targetsize)
2674                 target[0] = 0;
2675             return NULL;
2676         }
2677     }
2678     if (kind == PyUnicode_1BYTE_KIND) {
2679         const Py_UCS1 *start = (const Py_UCS1 *) data;
2680         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2681     }
2682     else if (kind == PyUnicode_2BYTE_KIND) {
2683         const Py_UCS2 *start = (const Py_UCS2 *) data;
2684         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2685     }
2686     else if (kind == PyUnicode_4BYTE_KIND) {
2687         memcpy(target, data, len * sizeof(Py_UCS4));
2688     }
2689     else {
2690         Py_UNREACHABLE();
2691     }
2692     if (copy_null)
2693         target[len] = 0;
2694     return target;
2695 }
2696 
2697 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2698 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2699                  int copy_null)
2700 {
2701     if (target == NULL || targetsize < 0) {
2702         PyErr_BadInternalCall();
2703         return NULL;
2704     }
2705     return as_ucs4(string, target, targetsize, copy_null);
2706 }
2707 
2708 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2709 PyUnicode_AsUCS4Copy(PyObject *string)
2710 {
2711     return as_ucs4(string, NULL, 0, 1);
2712 }
2713 
/* Maximum number of characters required for the output of %lld or %p.
   Used to size the on-stack buffers that unicode_fromformat_arg() fills
   with sprintf().
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2718 
2719 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2720 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2721                              Py_ssize_t width, Py_ssize_t precision)
2722 {
2723     Py_ssize_t length, fill, arglen;
2724     Py_UCS4 maxchar;
2725 
2726     if (PyUnicode_READY(str) == -1)
2727         return -1;
2728 
2729     length = PyUnicode_GET_LENGTH(str);
2730     if ((precision == -1 || precision >= length)
2731         && width <= length)
2732         return _PyUnicodeWriter_WriteStr(writer, str);
2733 
2734     if (precision != -1)
2735         length = Py_MIN(precision, length);
2736 
2737     arglen = Py_MAX(length, width);
2738     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2739         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2740     else
2741         maxchar = writer->maxchar;
2742 
2743     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2744         return -1;
2745 
2746     if (width > length) {
2747         fill = width - length;
2748         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2749             return -1;
2750         writer->pos += fill;
2751     }
2752 
2753     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2754                                   str, 0, length);
2755     writer->pos += length;
2756     return 0;
2757 }
2758 
2759 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2760 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2761                               Py_ssize_t width, Py_ssize_t precision)
2762 {
2763     /* UTF-8 */
2764     Py_ssize_t length;
2765     PyObject *unicode;
2766     int res;
2767 
2768     if (precision == -1) {
2769         length = strlen(str);
2770     }
2771     else {
2772         length = 0;
2773         while (length < precision && str[length]) {
2774             length++;
2775         }
2776     }
2777     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2778     if (unicode == NULL)
2779         return -1;
2780 
2781     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2782     Py_DECREF(unicode);
2783     return res;
2784 }
2785 
/* Format one conversion specification of a PyUnicode_FromFormatV() format
   string and append the result to `writer`.

   `f` points at the '%' that starts the specification and `vargs` supplies
   the argument(s) it consumes.  As parsed below, a specification is: an
   optional '0' flag, an optional decimal width, an optional '.' followed by
   a decimal precision, an optional 'l', 'll' or 'z' size modifier (only
   recognized in front of d, u or i) and a conversion character out of
   c, i, d, u, x, p, s, U, V, S, R, A and %.

   Return a pointer to the first character after the specification.  For an
   unrecognized conversion the rest of the format string (starting at the
   '%') is copied to the output and a pointer to its end is returned.
   Return NULL on error. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember the start of the specification: the fallback for unknown
       conversions copies the format string verbatim from here. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* -1 means "not specified" for both width and precision. */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if width*10 + digit would overflow. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* (and likewise %li / %lli; the modifier is only consumed when it is
       directly followed by d, u or i) */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format string:
       turn off the writer's over-allocation. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Let sprintf() produce the bare digits; width and precision are
           applied by hand further down. */
        if (*f == 'u') {
            if (longflag) {
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
            }
            else {
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
            }
        }
        else if (*f == 'x') {
            /* %x always reads an int: the size modifiers above are only
               recognized in front of d, u and i. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag) {
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
            }
            else {
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
            }
        }
        assert(len >= 0);

        /* From here on, precision is the number of characters written
           including the zero fill (at least the sprintf() output). */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the field width: with '0' if the zero flag was given,
           with spaces otherwise. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with zeros. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        /* Pointer argument; width and precision are not applied here. */
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (and its NUL) right by two
               characters and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object and a C string
           that is used instead when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Any object, formatted with PyObject_Str(). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Any object, formatted with PyObject_Repr(). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Any object, formatted with PyObject_ASCII(). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* Literal percent sign; consumes no argument. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        /* Point at the terminating NUL so the caller's loop stops. */
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
3083 
3084 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3085 PyUnicode_FromFormatV(const char *format, va_list vargs)
3086 {
3087     va_list vargs2;
3088     const char *f;
3089     _PyUnicodeWriter writer;
3090 
3091     _PyUnicodeWriter_Init(&writer);
3092     writer.min_length = strlen(format) + 100;
3093     writer.overallocate = 1;
3094 
3095     // Copy varags to be able to pass a reference to a subfunction.
3096     va_copy(vargs2, vargs);
3097 
3098     for (f = format; *f; ) {
3099         if (*f == '%') {
3100             f = unicode_fromformat_arg(&writer, f, &vargs2);
3101             if (f == NULL)
3102                 goto fail;
3103         }
3104         else {
3105             const char *p;
3106             Py_ssize_t len;
3107 
3108             p = f;
3109             do
3110             {
3111                 if ((unsigned char)*p > 127) {
3112                     PyErr_Format(PyExc_ValueError,
3113                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3114                         "string, got a non-ASCII byte: 0x%02x",
3115                         (unsigned char)*p);
3116                     goto fail;
3117                 }
3118                 p++;
3119             }
3120             while (*p != '\0' && *p != '%');
3121             len = p - f;
3122 
3123             if (*p == '\0')
3124                 writer.overallocate = 0;
3125 
3126             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3127                 goto fail;
3128 
3129             f = p;
3130         }
3131     }
3132     va_end(vargs2);
3133     return _PyUnicodeWriter_Finish(&writer);
3134 
3135   fail:
3136     va_end(vargs2);
3137     _PyUnicodeWriter_Dealloc(&writer);
3138     return NULL;
3139 }
3140 
3141 PyObject *
PyUnicode_FromFormat(const char * format,...)3142 PyUnicode_FromFormat(const char *format, ...)
3143 {
3144     PyObject* ret;
3145     va_list vargs;
3146 
3147 #ifdef HAVE_STDARG_PROTOTYPES
3148     va_start(vargs, format);
3149 #else
3150     va_start(vargs);
3151 #endif
3152     ret = PyUnicode_FromFormatV(format, vargs);
3153     va_end(vargs);
3154     return ret;
3155 }
3156 
3157 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3158 unicode_get_widechar_size(PyObject *unicode)
3159 {
3160     Py_ssize_t res;
3161 
3162     assert(unicode != NULL);
3163     assert(_PyUnicode_CHECK(unicode));
3164 
3165 #if USE_UNICODE_WCHAR_CACHE
3166     if (_PyUnicode_WSTR(unicode) != NULL) {
3167         return PyUnicode_WSTR_LENGTH(unicode);
3168     }
3169 #endif /* USE_UNICODE_WCHAR_CACHE */
3170     assert(PyUnicode_IS_READY(unicode));
3171 
3172     res = _PyUnicode_LENGTH(unicode);
3173 #if SIZEOF_WCHAR_T == 2
3174     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3175         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3176         const Py_UCS4 *end = s + res;
3177         for (; s < end; ++s) {
3178             if (*s > 0xFFFF) {
3179                 ++res;
3180             }
3181         }
3182     }
3183 #endif
3184     return res;
3185 }
3186 
3187 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3188 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3189 {
3190     assert(unicode != NULL);
3191     assert(_PyUnicode_CHECK(unicode));
3192 
3193 #if USE_UNICODE_WCHAR_CACHE
3194     const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3195     if (wstr != NULL) {
3196         memcpy(w, wstr, size * sizeof(wchar_t));
3197         return;
3198     }
3199 #else /* USE_UNICODE_WCHAR_CACHE */
3200     if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3201         memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3202         return;
3203     }
3204 #endif /* USE_UNICODE_WCHAR_CACHE */
3205     assert(PyUnicode_IS_READY(unicode));
3206 
3207     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3208         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3209         for (; size--; ++s, ++w) {
3210             *w = *s;
3211         }
3212     }
3213     else {
3214 #if SIZEOF_WCHAR_T == 4
3215         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3216         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3217         for (; size--; ++s, ++w) {
3218             *w = *s;
3219         }
3220 #else
3221         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3222         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3223         for (; size--; ++s, ++w) {
3224             Py_UCS4 ch = *s;
3225             if (ch > 0xFFFF) {
3226                 assert(ch <= MAX_UNICODE);
3227                 /* encode surrogate pair in this case */
3228                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3229                 if (!size--)
3230                     break;
3231                 *w = Py_UNICODE_LOW_SURROGATE(ch);
3232             }
3233             else {
3234                 *w = ch;
3235             }
3236         }
3237 #endif
3238     }
3239 }
3240 
3241 #ifdef HAVE_WCHAR_H
3242 
3243 /* Convert a Unicode object to a wide character string.
3244 
3245    - If w is NULL: return the number of wide characters (including the null
3246      character) required to convert the unicode object. Ignore size argument.
3247 
3248    - Otherwise: return the number of wide characters (excluding the null
3249      character) written into w. Write at most size wide characters (including
3250      the null character). */
3251 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3252 PyUnicode_AsWideChar(PyObject *unicode,
3253                      wchar_t *w,
3254                      Py_ssize_t size)
3255 {
3256     Py_ssize_t res;
3257 
3258     if (unicode == NULL) {
3259         PyErr_BadInternalCall();
3260         return -1;
3261     }
3262     if (!PyUnicode_Check(unicode)) {
3263         PyErr_BadArgument();
3264         return -1;
3265     }
3266 
3267     res = unicode_get_widechar_size(unicode);
3268     if (w == NULL) {
3269         return res + 1;
3270     }
3271 
3272     if (size > res) {
3273         size = res + 1;
3274     }
3275     else {
3276         res = size;
3277     }
3278     unicode_copy_as_widechar(unicode, w, size);
3279 
3280 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3281     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3282        non-Unicode locales and hence needs conversion first. */
3283     if (_Py_LocaleUsesNonUnicodeWchar()) {
3284         if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3285             return -1;
3286         }
3287     }
3288 #endif
3289 
3290     return res;
3291 }
3292 
3293 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3294 PyUnicode_AsWideCharString(PyObject *unicode,
3295                            Py_ssize_t *size)
3296 {
3297     wchar_t *buffer;
3298     Py_ssize_t buflen;
3299 
3300     if (unicode == NULL) {
3301         PyErr_BadInternalCall();
3302         return NULL;
3303     }
3304     if (!PyUnicode_Check(unicode)) {
3305         PyErr_BadArgument();
3306         return NULL;
3307     }
3308 
3309     buflen = unicode_get_widechar_size(unicode);
3310     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3311     if (buffer == NULL) {
3312         PyErr_NoMemory();
3313         return NULL;
3314     }
3315     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3316 
3317 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318     /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319        non-Unicode locales and hence needs conversion first. */
3320     if (_Py_LocaleUsesNonUnicodeWchar()) {
3321         if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3322             return NULL;
3323         }
3324     }
3325 #endif
3326 
3327     if (size != NULL) {
3328         *size = buflen;
3329     }
3330     else if (wcslen(buffer) != (size_t)buflen) {
3331         PyMem_Free(buffer);
3332         PyErr_SetString(PyExc_ValueError,
3333                         "embedded null character");
3334         return NULL;
3335     }
3336     return buffer;
3337 }
3338 
3339 #endif /* HAVE_WCHAR_H */
3340 
3341 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3342 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3343 {
3344     wchar_t **p = (wchar_t **)ptr;
3345     if (obj == NULL) {
3346 #if !USE_UNICODE_WCHAR_CACHE
3347         PyMem_Free(*p);
3348 #endif /* USE_UNICODE_WCHAR_CACHE */
3349         *p = NULL;
3350         return 1;
3351     }
3352     if (PyUnicode_Check(obj)) {
3353 #if USE_UNICODE_WCHAR_CACHE
3354         *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3355         if (*p == NULL) {
3356             return 0;
3357         }
3358         return 1;
3359 #else /* USE_UNICODE_WCHAR_CACHE */
3360         *p = PyUnicode_AsWideCharString(obj, NULL);
3361         if (*p == NULL) {
3362             return 0;
3363         }
3364         return Py_CLEANUP_SUPPORTED;
3365 #endif /* USE_UNICODE_WCHAR_CACHE */
3366     }
3367     PyErr_Format(PyExc_TypeError,
3368                  "argument must be str, not %.50s",
3369                  Py_TYPE(obj)->tp_name);
3370     return 0;
3371 }
3372 
3373 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3374 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3375 {
3376     wchar_t **p = (wchar_t **)ptr;
3377     if (obj == NULL) {
3378 #if !USE_UNICODE_WCHAR_CACHE
3379         PyMem_Free(*p);
3380 #endif /* USE_UNICODE_WCHAR_CACHE */
3381         *p = NULL;
3382         return 1;
3383     }
3384     if (obj == Py_None) {
3385         *p = NULL;
3386         return 1;
3387     }
3388     if (PyUnicode_Check(obj)) {
3389 #if USE_UNICODE_WCHAR_CACHE
3390         *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3391         if (*p == NULL) {
3392             return 0;
3393         }
3394         return 1;
3395 #else /* USE_UNICODE_WCHAR_CACHE */
3396         *p = PyUnicode_AsWideCharString(obj, NULL);
3397         if (*p == NULL) {
3398             return 0;
3399         }
3400         return Py_CLEANUP_SUPPORTED;
3401 #endif /* USE_UNICODE_WCHAR_CACHE */
3402     }
3403     PyErr_Format(PyExc_TypeError,
3404                  "argument must be str or None, not %.50s",
3405                  Py_TYPE(obj)->tp_name);
3406     return 0;
3407 }
3408 
3409 PyObject *
PyUnicode_FromOrdinal(int ordinal)3410 PyUnicode_FromOrdinal(int ordinal)
3411 {
3412     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3413         PyErr_SetString(PyExc_ValueError,
3414                         "chr() arg not in range(0x110000)");
3415         return NULL;
3416     }
3417 
3418     return unicode_char((Py_UCS4)ordinal);
3419 }
3420 
3421 PyObject *
PyUnicode_FromObject(PyObject * obj)3422 PyUnicode_FromObject(PyObject *obj)
3423 {
3424     /* XXX Perhaps we should make this API an alias of
3425        PyObject_Str() instead ?! */
3426     if (PyUnicode_CheckExact(obj)) {
3427         if (PyUnicode_READY(obj) == -1)
3428             return NULL;
3429         Py_INCREF(obj);
3430         return obj;
3431     }
3432     if (PyUnicode_Check(obj)) {
3433         /* For a Unicode subtype that's not a Unicode object,
3434            return a true Unicode object with the same data. */
3435         return _PyUnicode_Copy(obj);
3436     }
3437     PyErr_Format(PyExc_TypeError,
3438                  "Can't convert '%.100s' object to str implicitly",
3439                  Py_TYPE(obj)->tp_name);
3440     return NULL;
3441 }
3442 
3443 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3444 PyUnicode_FromEncodedObject(PyObject *obj,
3445                             const char *encoding,
3446                             const char *errors)
3447 {
3448     Py_buffer buffer;
3449     PyObject *v;
3450 
3451     if (obj == NULL) {
3452         PyErr_BadInternalCall();
3453         return NULL;
3454     }
3455 
3456     /* Decoding bytes objects is the most common case and should be fast */
3457     if (PyBytes_Check(obj)) {
3458         if (PyBytes_GET_SIZE(obj) == 0) {
3459             if (unicode_check_encoding_errors(encoding, errors) < 0) {
3460                 return NULL;
3461             }
3462             _Py_RETURN_UNICODE_EMPTY();
3463         }
3464         return PyUnicode_Decode(
3465                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3466                 encoding, errors);
3467     }
3468 
3469     if (PyUnicode_Check(obj)) {
3470         PyErr_SetString(PyExc_TypeError,
3471                         "decoding str is not supported");
3472         return NULL;
3473     }
3474 
3475     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3476     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3477         PyErr_Format(PyExc_TypeError,
3478                      "decoding to str: need a bytes-like object, %.80s found",
3479                      Py_TYPE(obj)->tp_name);
3480         return NULL;
3481     }
3482 
3483     if (buffer.len == 0) {
3484         PyBuffer_Release(&buffer);
3485         if (unicode_check_encoding_errors(encoding, errors) < 0) {
3486             return NULL;
3487         }
3488         _Py_RETURN_UNICODE_EMPTY();
3489     }
3490 
3491     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3492     PyBuffer_Release(&buffer);
3493     return v;
3494 }
3495 
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are kept (lowercased); every run of other
   characters between two kept characters becomes a single '_', and such
   runs at the start or end of the name are dropped (e.g. "UTF-8" gives
   "utf_8").

   The result is written to `lower`, a buffer of `lower_len` bytes, and is
   always null-terminated on success.  Return 1 on success, or 0 on error
   (the normalized name is longer than lower_len-1, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the terminating null character; without this
           check `lower_len - 1` below would wrap around and the end
           pointer would be far outside of the buffer. */
        return 0;
    }

    l = lower;
    /* Last byte of the buffer: reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    punct = 0;
    for (e = encoding; *e != '\0'; e++) {
        char c = *e;

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit the pending separator, but never as first character. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            /* Only remember that a separator was seen: it is written
               lazily, when (and if) another kept character follows. */
            punct = 1;
        }
    }
    *l = '\0';
    return 1;
}
3544 
3545 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3546 PyUnicode_Decode(const char *s,
3547                  Py_ssize_t size,
3548                  const char *encoding,
3549                  const char *errors)
3550 {
3551     PyObject *buffer = NULL, *unicode;
3552     Py_buffer info;
3553     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3554 
3555     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3556         return NULL;
3557     }
3558 
3559     if (size == 0) {
3560         _Py_RETURN_UNICODE_EMPTY();
3561     }
3562 
3563     if (encoding == NULL) {
3564         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3565     }
3566 
3567     /* Shortcuts for common default encodings */
3568     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569         char *lower = buflower;
3570 
3571         /* Fast paths */
3572         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573             lower += 3;
3574             if (*lower == '_') {
3575                 /* Match "utf8" and "utf_8" */
3576                 lower++;
3577             }
3578 
3579             if (lower[0] == '8' && lower[1] == 0) {
3580                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3581             }
3582             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3584             }
3585             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3587             }
3588         }
3589         else {
3590             if (strcmp(lower, "ascii") == 0
3591                 || strcmp(lower, "us_ascii") == 0) {
3592                 return PyUnicode_DecodeASCII(s, size, errors);
3593             }
3594     #ifdef MS_WINDOWS
3595             else if (strcmp(lower, "mbcs") == 0) {
3596                 return PyUnicode_DecodeMBCS(s, size, errors);
3597             }
3598     #endif
3599             else if (strcmp(lower, "latin1") == 0
3600                      || strcmp(lower, "latin_1") == 0
3601                      || strcmp(lower, "iso_8859_1") == 0
3602                      || strcmp(lower, "iso8859_1") == 0) {
3603                 return PyUnicode_DecodeLatin1(s, size, errors);
3604             }
3605         }
3606     }
3607 
3608     /* Decode via the codec registry */
3609     buffer = NULL;
3610     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3611         goto onError;
3612     buffer = PyMemoryView_FromBuffer(&info);
3613     if (buffer == NULL)
3614         goto onError;
3615     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3616     if (unicode == NULL)
3617         goto onError;
3618     if (!PyUnicode_Check(unicode)) {
3619         PyErr_Format(PyExc_TypeError,
3620                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3621                      "use codecs.decode() to decode to arbitrary types",
3622                      encoding,
3623                      Py_TYPE(unicode)->tp_name);
3624         Py_DECREF(unicode);
3625         goto onError;
3626     }
3627     Py_DECREF(buffer);
3628     return unicode_result(unicode);
3629 
3630   onError:
3631     Py_XDECREF(buffer);
3632     return NULL;
3633 }
3634 
3635 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3636 PyUnicode_AsDecodedObject(PyObject *unicode,
3637                           const char *encoding,
3638                           const char *errors)
3639 {
3640     if (!PyUnicode_Check(unicode)) {
3641         PyErr_BadArgument();
3642         return NULL;
3643     }
3644 
3645     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3646                      "PyUnicode_AsDecodedObject() is deprecated; "
3647                      "use PyCodec_Decode() to decode from str", 1) < 0)
3648         return NULL;
3649 
3650     if (encoding == NULL)
3651         encoding = PyUnicode_GetDefaultEncoding();
3652 
3653     /* Decode via the codec registry */
3654     return PyCodec_Decode(unicode, encoding, errors);
3655 }
3656 
3657 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3658 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3659                            const char *encoding,
3660                            const char *errors)
3661 {
3662     PyObject *v;
3663 
3664     if (!PyUnicode_Check(unicode)) {
3665         PyErr_BadArgument();
3666         goto onError;
3667     }
3668 
3669     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3670                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3671                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3672         return NULL;
3673 
3674     if (encoding == NULL)
3675         encoding = PyUnicode_GetDefaultEncoding();
3676 
3677     /* Decode via the codec registry */
3678     v = PyCodec_Decode(unicode, encoding, errors);
3679     if (v == NULL)
3680         goto onError;
3681     if (!PyUnicode_Check(v)) {
3682         PyErr_Format(PyExc_TypeError,
3683                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3684                      "use codecs.decode() to decode to arbitrary types",
3685                      encoding,
3686                      Py_TYPE(unicode)->tp_name);
3687         Py_DECREF(v);
3688         goto onError;
3689     }
3690     return unicode_result(v);
3691 
3692   onError:
3693     return NULL;
3694 }
3695 
3696 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3697 PyUnicode_AsEncodedObject(PyObject *unicode,
3698                           const char *encoding,
3699                           const char *errors)
3700 {
3701     PyObject *v;
3702 
3703     if (!PyUnicode_Check(unicode)) {
3704         PyErr_BadArgument();
3705         goto onError;
3706     }
3707 
3708     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3709                      "PyUnicode_AsEncodedObject() is deprecated; "
3710                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3711                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3712         return NULL;
3713 
3714     if (encoding == NULL)
3715         encoding = PyUnicode_GetDefaultEncoding();
3716 
3717     /* Encode via the codec registry */
3718     v = PyCodec_Encode(unicode, encoding, errors);
3719     if (v == NULL)
3720         goto onError;
3721     return v;
3722 
3723   onError:
3724     return NULL;
3725 }
3726 
3727 
3728 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3729 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3730                       int current_locale)
3731 {
3732     Py_ssize_t wlen;
3733     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3734     if (wstr == NULL) {
3735         return NULL;
3736     }
3737 
3738     if ((size_t)wlen != wcslen(wstr)) {
3739         PyErr_SetString(PyExc_ValueError, "embedded null character");
3740         PyMem_Free(wstr);
3741         return NULL;
3742     }
3743 
3744     char *str;
3745     size_t error_pos;
3746     const char *reason;
3747     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3748                                  current_locale, error_handler);
3749     PyMem_Free(wstr);
3750 
3751     if (res != 0) {
3752         if (res == -2) {
3753             PyObject *exc;
3754             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3755                     "locale", unicode,
3756                     (Py_ssize_t)error_pos,
3757                     (Py_ssize_t)(error_pos+1),
3758                     reason);
3759             if (exc != NULL) {
3760                 PyCodec_StrictErrors(exc);
3761                 Py_DECREF(exc);
3762             }
3763         }
3764         else if (res == -3) {
3765             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3766         }
3767         else {
3768             PyErr_NoMemory();
3769         }
3770         return NULL;
3771     }
3772 
3773     PyObject *bytes = PyBytes_FromString(str);
3774     PyMem_RawFree(str);
3775     return bytes;
3776 }
3777 
3778 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3779 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3780 {
3781     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3782     return unicode_encode_locale(unicode, error_handler, 1);
3783 }
3784 
3785 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3786 PyUnicode_EncodeFSDefault(PyObject *unicode)
3787 {
3788     PyInterpreterState *interp = _PyInterpreterState_GET();
3789     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3790     if (fs_codec->utf8) {
3791         return unicode_encode_utf8(unicode,
3792                                    fs_codec->error_handler,
3793                                    fs_codec->errors);
3794     }
3795 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3796     else if (fs_codec->encoding) {
3797         return PyUnicode_AsEncodedString(unicode,
3798                                          fs_codec->encoding,
3799                                          fs_codec->errors);
3800     }
3801 #endif
3802     else {
3803         /* Before _PyUnicode_InitEncodings() is called, the Python codec
3804            machinery is not ready and so cannot be used:
3805            use wcstombs() in this case. */
3806         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3807         const wchar_t *filesystem_errors = config->filesystem_errors;
3808         assert(filesystem_errors != NULL);
3809         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3810         assert(errors != _Py_ERROR_UNKNOWN);
3811 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3812         return unicode_encode_utf8(unicode, errors, NULL);
3813 #else
3814         return unicode_encode_locale(unicode, errors, 0);
3815 #endif
3816     }
3817 }
3818 
3819 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3820 PyUnicode_AsEncodedString(PyObject *unicode,
3821                           const char *encoding,
3822                           const char *errors)
3823 {
3824     PyObject *v;
3825     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3826 
3827     if (!PyUnicode_Check(unicode)) {
3828         PyErr_BadArgument();
3829         return NULL;
3830     }
3831 
3832     if (unicode_check_encoding_errors(encoding, errors) < 0) {
3833         return NULL;
3834     }
3835 
3836     if (encoding == NULL) {
3837         return _PyUnicode_AsUTF8String(unicode, errors);
3838     }
3839 
3840     /* Shortcuts for common default encodings */
3841     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3842         char *lower = buflower;
3843 
3844         /* Fast paths */
3845         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3846             lower += 3;
3847             if (*lower == '_') {
3848                 /* Match "utf8" and "utf_8" */
3849                 lower++;
3850             }
3851 
3852             if (lower[0] == '8' && lower[1] == 0) {
3853                 return _PyUnicode_AsUTF8String(unicode, errors);
3854             }
3855             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3856                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3857             }
3858             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3859                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3860             }
3861         }
3862         else {
3863             if (strcmp(lower, "ascii") == 0
3864                 || strcmp(lower, "us_ascii") == 0) {
3865                 return _PyUnicode_AsASCIIString(unicode, errors);
3866             }
3867 #ifdef MS_WINDOWS
3868             else if (strcmp(lower, "mbcs") == 0) {
3869                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3870             }
3871 #endif
3872             else if (strcmp(lower, "latin1") == 0 ||
3873                      strcmp(lower, "latin_1") == 0 ||
3874                      strcmp(lower, "iso_8859_1") == 0 ||
3875                      strcmp(lower, "iso8859_1") == 0) {
3876                 return _PyUnicode_AsLatin1String(unicode, errors);
3877             }
3878         }
3879     }
3880 
3881     /* Encode via the codec registry */
3882     v = _PyCodec_EncodeText(unicode, encoding, errors);
3883     if (v == NULL)
3884         return NULL;
3885 
3886     /* The normal path */
3887     if (PyBytes_Check(v))
3888         return v;
3889 
3890     /* If the codec returns a buffer, raise a warning and convert to bytes */
3891     if (PyByteArray_Check(v)) {
3892         int error;
3893         PyObject *b;
3894 
3895         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3896             "encoder %s returned bytearray instead of bytes; "
3897             "use codecs.encode() to encode to arbitrary types",
3898             encoding);
3899         if (error) {
3900             Py_DECREF(v);
3901             return NULL;
3902         }
3903 
3904         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3905                                       PyByteArray_GET_SIZE(v));
3906         Py_DECREF(v);
3907         return b;
3908     }
3909 
3910     PyErr_Format(PyExc_TypeError,
3911                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3912                  "use codecs.encode() to encode to arbitrary types",
3913                  encoding,
3914                  Py_TYPE(v)->tp_name);
3915     Py_DECREF(v);
3916     return NULL;
3917 }
3918 
3919 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3920 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3921                            const char *encoding,
3922                            const char *errors)
3923 {
3924     PyObject *v;
3925 
3926     if (!PyUnicode_Check(unicode)) {
3927         PyErr_BadArgument();
3928         goto onError;
3929     }
3930 
3931     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3932                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3933                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3934         return NULL;
3935 
3936     if (encoding == NULL)
3937         encoding = PyUnicode_GetDefaultEncoding();
3938 
3939     /* Encode via the codec registry */
3940     v = PyCodec_Encode(unicode, encoding, errors);
3941     if (v == NULL)
3942         goto onError;
3943     if (!PyUnicode_Check(v)) {
3944         PyErr_Format(PyExc_TypeError,
3945                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3946                      "use codecs.encode() to encode to arbitrary types",
3947                      encoding,
3948                      Py_TYPE(v)->tp_name);
3949         Py_DECREF(v);
3950         goto onError;
3951     }
3952     return v;
3953 
3954   onError:
3955     return NULL;
3956 }
3957 
3958 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3959 unicode_decode_locale(const char *str, Py_ssize_t len,
3960                       _Py_error_handler errors, int current_locale)
3961 {
3962     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3963         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3964         return NULL;
3965     }
3966 
3967     wchar_t *wstr;
3968     size_t wlen;
3969     const char *reason;
3970     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3971                                  current_locale, errors);
3972     if (res != 0) {
3973         if (res == -2) {
3974             PyObject *exc;
3975             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3976                                         "locale", str, len,
3977                                         (Py_ssize_t)wlen,
3978                                         (Py_ssize_t)(wlen + 1),
3979                                         reason);
3980             if (exc != NULL) {
3981                 PyCodec_StrictErrors(exc);
3982                 Py_DECREF(exc);
3983             }
3984         }
3985         else if (res == -3) {
3986             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3987         }
3988         else {
3989             PyErr_NoMemory();
3990         }
3991         return NULL;
3992     }
3993 
3994     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3995     PyMem_RawFree(wstr);
3996     return unicode;
3997 }
3998 
3999 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4000 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4001                               const char *errors)
4002 {
4003     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4004     return unicode_decode_locale(str, len, error_handler, 1);
4005 }
4006 
4007 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4008 PyUnicode_DecodeLocale(const char *str, const char *errors)
4009 {
4010     Py_ssize_t size = (Py_ssize_t)strlen(str);
4011     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4012     return unicode_decode_locale(str, size, error_handler, 1);
4013 }
4014 
4015 
4016 PyObject*
PyUnicode_DecodeFSDefault(const char * s)4017 PyUnicode_DecodeFSDefault(const char *s) {
4018     Py_ssize_t size = (Py_ssize_t)strlen(s);
4019     return PyUnicode_DecodeFSDefaultAndSize(s, size);
4020 }
4021 
4022 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4023 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4024 {
4025     PyInterpreterState *interp = _PyInterpreterState_GET();
4026     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4027     if (fs_codec->utf8) {
4028         return unicode_decode_utf8(s, size,
4029                                    fs_codec->error_handler,
4030                                    fs_codec->errors,
4031                                    NULL);
4032     }
4033 #ifndef _Py_FORCE_UTF8_FS_ENCODING
4034     else if (fs_codec->encoding) {
4035         return PyUnicode_Decode(s, size,
4036                                 fs_codec->encoding,
4037                                 fs_codec->errors);
4038     }
4039 #endif
4040     else {
4041         /* Before _PyUnicode_InitEncodings() is called, the Python codec
4042            machinery is not ready and so cannot be used:
4043            use mbstowcs() in this case. */
4044         const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4045         const wchar_t *filesystem_errors = config->filesystem_errors;
4046         assert(filesystem_errors != NULL);
4047         _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4048         assert(errors != _Py_ERROR_UNKNOWN);
4049 #ifdef _Py_FORCE_UTF8_FS_ENCODING
4050         return unicode_decode_utf8(s, size, errors, NULL, NULL);
4051 #else
4052         return unicode_decode_locale(s, size, errors, 0);
4053 #endif
4054     }
4055 }
4056 
4057 
4058 int
PyUnicode_FSConverter(PyObject * arg,void * addr)4059 PyUnicode_FSConverter(PyObject* arg, void* addr)
4060 {
4061     PyObject *path = NULL;
4062     PyObject *output = NULL;
4063     Py_ssize_t size;
4064     const char *data;
4065     if (arg == NULL) {
4066         Py_DECREF(*(PyObject**)addr);
4067         *(PyObject**)addr = NULL;
4068         return 1;
4069     }
4070     path = PyOS_FSPath(arg);
4071     if (path == NULL) {
4072         return 0;
4073     }
4074     if (PyBytes_Check(path)) {
4075         output = path;
4076     }
4077     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4078         output = PyUnicode_EncodeFSDefault(path);
4079         Py_DECREF(path);
4080         if (!output) {
4081             return 0;
4082         }
4083         assert(PyBytes_Check(output));
4084     }
4085 
4086     size = PyBytes_GET_SIZE(output);
4087     data = PyBytes_AS_STRING(output);
4088     if ((size_t)size != strlen(data)) {
4089         PyErr_SetString(PyExc_ValueError, "embedded null byte");
4090         Py_DECREF(output);
4091         return 0;
4092     }
4093     *(PyObject**)addr = output;
4094     return Py_CLEANUP_SUPPORTED;
4095 }
4096 
4097 
4098 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4099 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4100 {
4101     int is_buffer = 0;
4102     PyObject *path = NULL;
4103     PyObject *output = NULL;
4104     if (arg == NULL) {
4105         Py_DECREF(*(PyObject**)addr);
4106         *(PyObject**)addr = NULL;
4107         return 1;
4108     }
4109 
4110     is_buffer = PyObject_CheckBuffer(arg);
4111     if (!is_buffer) {
4112         path = PyOS_FSPath(arg);
4113         if (path == NULL) {
4114             return 0;
4115         }
4116     }
4117     else {
4118         path = arg;
4119         Py_INCREF(arg);
4120     }
4121 
4122     if (PyUnicode_Check(path)) {
4123         output = path;
4124     }
4125     else if (PyBytes_Check(path) || is_buffer) {
4126         PyObject *path_bytes = NULL;
4127 
4128         if (!PyBytes_Check(path) &&
4129             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4130             "path should be string, bytes, or os.PathLike, not %.200s",
4131             Py_TYPE(arg)->tp_name)) {
4132                 Py_DECREF(path);
4133             return 0;
4134         }
4135         path_bytes = PyBytes_FromObject(path);
4136         Py_DECREF(path);
4137         if (!path_bytes) {
4138             return 0;
4139         }
4140         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4141                                                   PyBytes_GET_SIZE(path_bytes));
4142         Py_DECREF(path_bytes);
4143         if (!output) {
4144             return 0;
4145         }
4146     }
4147     else {
4148         PyErr_Format(PyExc_TypeError,
4149                      "path should be string, bytes, or os.PathLike, not %.200s",
4150                      Py_TYPE(arg)->tp_name);
4151         Py_DECREF(path);
4152         return 0;
4153     }
4154     if (PyUnicode_READY(output) == -1) {
4155         Py_DECREF(output);
4156         return 0;
4157     }
4158     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4159                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4160         PyErr_SetString(PyExc_ValueError, "embedded null character");
4161         Py_DECREF(output);
4162         return 0;
4163     }
4164     *(PyObject**)addr = output;
4165     return Py_CLEANUP_SUPPORTED;
4166 }
4167 
4168 
4169 static int unicode_fill_utf8(PyObject *unicode);
4170 
4171 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4172 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4173 {
4174     if (!PyUnicode_Check(unicode)) {
4175         PyErr_BadArgument();
4176         return NULL;
4177     }
4178     if (PyUnicode_READY(unicode) == -1)
4179         return NULL;
4180 
4181     if (PyUnicode_UTF8(unicode) == NULL) {
4182         if (unicode_fill_utf8(unicode) == -1) {
4183             return NULL;
4184         }
4185     }
4186 
4187     if (psize)
4188         *psize = PyUnicode_UTF8_LENGTH(unicode);
4189     return PyUnicode_UTF8(unicode);
4190 }
4191 
4192 const char *
PyUnicode_AsUTF8(PyObject * unicode)4193 PyUnicode_AsUTF8(PyObject *unicode)
4194 {
4195     return PyUnicode_AsUTF8AndSize(unicode, NULL);
4196 }
4197 
4198 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4199 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4200 {
4201     if (!PyUnicode_Check(unicode)) {
4202         PyErr_BadArgument();
4203         return NULL;
4204     }
4205     Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4206     if (w == NULL) {
4207         /* Non-ASCII compact unicode object */
4208         assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4209         assert(PyUnicode_IS_READY(unicode));
4210 
4211         Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4212         if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4213             PyErr_NoMemory();
4214             return NULL;
4215         }
4216         w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4217         if (w == NULL) {
4218             PyErr_NoMemory();
4219             return NULL;
4220         }
4221         unicode_copy_as_widechar(unicode, w, wlen + 1);
4222         _PyUnicode_WSTR(unicode) = w;
4223         if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4224             _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4225         }
4226     }
4227     if (size != NULL)
4228         *size = PyUnicode_WSTR_LENGTH(unicode);
4229     return w;
4230 }
4231 
4232 /* Deprecated APIs */
4233 
4234 _Py_COMP_DIAG_PUSH
4235 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4236 
4237 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4238 PyUnicode_AsUnicode(PyObject *unicode)
4239 {
4240     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4241 }
4242 
4243 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4244 _PyUnicode_AsUnicode(PyObject *unicode)
4245 {
4246     Py_ssize_t size;
4247     const Py_UNICODE *wstr;
4248 
4249     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4250     if (wstr && wcslen(wstr) != (size_t)size) {
4251         PyErr_SetString(PyExc_ValueError, "embedded null character");
4252         return NULL;
4253     }
4254     return wstr;
4255 }
4256 
4257 
4258 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4259 PyUnicode_GetSize(PyObject *unicode)
4260 {
4261     if (!PyUnicode_Check(unicode)) {
4262         PyErr_BadArgument();
4263         goto onError;
4264     }
4265     if (_PyUnicode_WSTR(unicode) == NULL) {
4266         if (PyUnicode_AsUnicode(unicode) == NULL)
4267             goto onError;
4268     }
4269     return PyUnicode_WSTR_LENGTH(unicode);
4270 
4271   onError:
4272     return -1;
4273 }
4274 
4275 _Py_COMP_DIAG_POP
4276 
4277 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4278 PyUnicode_GetLength(PyObject *unicode)
4279 {
4280     if (!PyUnicode_Check(unicode)) {
4281         PyErr_BadArgument();
4282         return -1;
4283     }
4284     if (PyUnicode_READY(unicode) == -1)
4285         return -1;
4286     return PyUnicode_GET_LENGTH(unicode);
4287 }
4288 
4289 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4290 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4291 {
4292     const void *data;
4293     int kind;
4294 
4295     if (!PyUnicode_Check(unicode)) {
4296         PyErr_BadArgument();
4297         return (Py_UCS4)-1;
4298     }
4299     if (PyUnicode_READY(unicode) == -1) {
4300         return (Py_UCS4)-1;
4301     }
4302     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4303         PyErr_SetString(PyExc_IndexError, "string index out of range");
4304         return (Py_UCS4)-1;
4305     }
4306     data = PyUnicode_DATA(unicode);
4307     kind = PyUnicode_KIND(unicode);
4308     return PyUnicode_READ(kind, data, index);
4309 }
4310 
4311 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4312 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4313 {
4314     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4315         PyErr_BadArgument();
4316         return -1;
4317     }
4318     assert(PyUnicode_IS_READY(unicode));
4319     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4320         PyErr_SetString(PyExc_IndexError, "string index out of range");
4321         return -1;
4322     }
4323     if (unicode_check_modifiable(unicode))
4324         return -1;
4325     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4326         PyErr_SetString(PyExc_ValueError, "character out of range");
4327         return -1;
4328     }
4329     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4330                     index, ch);
4331     return 0;
4332 }
4333 
/* Return the name of the default encoding, the constant string "utf-8".
   The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4339 
4340 /* create or adjust a UnicodeDecodeError */
4341 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4342 make_decode_exception(PyObject **exceptionObject,
4343                       const char *encoding,
4344                       const char *input, Py_ssize_t length,
4345                       Py_ssize_t startpos, Py_ssize_t endpos,
4346                       const char *reason)
4347 {
4348     if (*exceptionObject == NULL) {
4349         *exceptionObject = PyUnicodeDecodeError_Create(
4350             encoding, input, length, startpos, endpos, reason);
4351     }
4352     else {
4353         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4354             goto onError;
4355         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4356             goto onError;
4357         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4358             goto onError;
4359     }
4360     return;
4361 
4362 onError:
4363     Py_CLEAR(*exceptionObject);
4364 }
4365 
4366 #ifdef MS_WINDOWS
4367 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4368 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4369 {
4370     if (newsize > *size) {
4371         wchar_t *newbuf = *buf;
4372         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4373             PyErr_NoMemory();
4374             return -1;
4375         }
4376         *buf = newbuf;
4377     }
4378     *size = newsize;
4379     return 0;
4380 }
4381 
4382 /* error handling callback helper:
4383    build arguments, call the callback and check the arguments,
4384    if no exception occurred, copy the replacement to the output
4385    and adjust various state variables.
4386    return 0 on success, -1 on error
4387 */
4388 
4389 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4390 unicode_decode_call_errorhandler_wchar(
4391     const char *errors, PyObject **errorHandler,
4392     const char *encoding, const char *reason,
4393     const char **input, const char **inend, Py_ssize_t *startinpos,
4394     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4395     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4396 {
4397     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4398 
4399     PyObject *restuple = NULL;
4400     PyObject *repunicode = NULL;
4401     Py_ssize_t outsize;
4402     Py_ssize_t insize;
4403     Py_ssize_t requiredsize;
4404     Py_ssize_t newpos;
4405     PyObject *inputobj = NULL;
4406     Py_ssize_t repwlen;
4407 
4408     if (*errorHandler == NULL) {
4409         *errorHandler = PyCodec_LookupError(errors);
4410         if (*errorHandler == NULL)
4411             goto onError;
4412     }
4413 
4414     make_decode_exception(exceptionObject,
4415         encoding,
4416         *input, *inend - *input,
4417         *startinpos, *endinpos,
4418         reason);
4419     if (*exceptionObject == NULL)
4420         goto onError;
4421 
4422     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4423     if (restuple == NULL)
4424         goto onError;
4425     if (!PyTuple_Check(restuple)) {
4426         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4427         goto onError;
4428     }
4429     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4430         goto onError;
4431 
4432     /* Copy back the bytes variables, which might have been modified by the
4433        callback */
4434     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4435     if (!inputobj)
4436         goto onError;
4437     *input = PyBytes_AS_STRING(inputobj);
4438     insize = PyBytes_GET_SIZE(inputobj);
4439     *inend = *input + insize;
4440     /* we can DECREF safely, as the exception has another reference,
4441        so the object won't go away. */
4442     Py_DECREF(inputobj);
4443 
4444     if (newpos<0)
4445         newpos = insize+newpos;
4446     if (newpos<0 || newpos>insize) {
4447         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4448         goto onError;
4449     }
4450 
4451 #if USE_UNICODE_WCHAR_CACHE
4452 _Py_COMP_DIAG_PUSH
4453 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4454     repwlen = PyUnicode_GetSize(repunicode);
4455     if (repwlen < 0)
4456         goto onError;
4457 _Py_COMP_DIAG_POP
4458 #else /* USE_UNICODE_WCHAR_CACHE */
4459     repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4460     if (repwlen < 0)
4461         goto onError;
4462     repwlen--;
4463 #endif /* USE_UNICODE_WCHAR_CACHE */
4464     /* need more space? (at least enough for what we
4465        have+the replacement+the rest of the string (starting
4466        at the new input position), so we won't have to check space
4467        when there are no errors in the rest of the string) */
4468     requiredsize = *outpos;
4469     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4470         goto overflow;
4471     requiredsize += repwlen;
4472     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4473         goto overflow;
4474     requiredsize += insize - newpos;
4475     outsize = *bufsize;
4476     if (requiredsize > outsize) {
4477         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4478             requiredsize = 2*outsize;
4479         if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4480             goto onError;
4481         }
4482     }
4483     PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4484     *outpos += repwlen;
4485     *endinpos = newpos;
4486     *inptr = *input + newpos;
4487 
4488     /* we made it! */
4489     Py_DECREF(restuple);
4490     return 0;
4491 
4492   overflow:
4493     PyErr_SetString(PyExc_OverflowError,
4494                     "decoded result is too long for a Python string");
4495 
4496   onError:
4497     Py_XDECREF(restuple);
4498     return -1;
4499 }
4500 #endif   /* MS_WINDOWS */
4501 
4502 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4503 unicode_decode_call_errorhandler_writer(
4504     const char *errors, PyObject **errorHandler,
4505     const char *encoding, const char *reason,
4506     const char **input, const char **inend, Py_ssize_t *startinpos,
4507     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4508     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4509 {
4510     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4511 
4512     PyObject *restuple = NULL;
4513     PyObject *repunicode = NULL;
4514     Py_ssize_t insize;
4515     Py_ssize_t newpos;
4516     Py_ssize_t replen;
4517     Py_ssize_t remain;
4518     PyObject *inputobj = NULL;
4519     int need_to_grow = 0;
4520     const char *new_inptr;
4521 
4522     if (*errorHandler == NULL) {
4523         *errorHandler = PyCodec_LookupError(errors);
4524         if (*errorHandler == NULL)
4525             goto onError;
4526     }
4527 
4528     make_decode_exception(exceptionObject,
4529         encoding,
4530         *input, *inend - *input,
4531         *startinpos, *endinpos,
4532         reason);
4533     if (*exceptionObject == NULL)
4534         goto onError;
4535 
4536     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4537     if (restuple == NULL)
4538         goto onError;
4539     if (!PyTuple_Check(restuple)) {
4540         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4541         goto onError;
4542     }
4543     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4544         goto onError;
4545 
4546     /* Copy back the bytes variables, which might have been modified by the
4547        callback */
4548     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4549     if (!inputobj)
4550         goto onError;
4551     remain = *inend - *input - *endinpos;
4552     *input = PyBytes_AS_STRING(inputobj);
4553     insize = PyBytes_GET_SIZE(inputobj);
4554     *inend = *input + insize;
4555     /* we can DECREF safely, as the exception has another reference,
4556        so the object won't go away. */
4557     Py_DECREF(inputobj);
4558 
4559     if (newpos<0)
4560         newpos = insize+newpos;
4561     if (newpos<0 || newpos>insize) {
4562         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4563         goto onError;
4564     }
4565 
4566     replen = PyUnicode_GET_LENGTH(repunicode);
4567     if (replen > 1) {
4568         writer->min_length += replen - 1;
4569         need_to_grow = 1;
4570     }
4571     new_inptr = *input + newpos;
4572     if (*inend - new_inptr > remain) {
4573         /* We don't know the decoding algorithm here so we make the worst
4574            assumption that one byte decodes to one unicode character.
4575            If unfortunately one byte could decode to more unicode characters,
4576            the decoder may write out-of-bound then.  Is it possible for the
4577            algorithms using this function? */
4578         writer->min_length += *inend - new_inptr - remain;
4579         need_to_grow = 1;
4580     }
4581     if (need_to_grow) {
4582         writer->overallocate = 1;
4583         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4584                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4585             goto onError;
4586     }
4587     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4588         goto onError;
4589 
4590     *endinpos = newpos;
4591     *inptr = new_inptr;
4592 
4593     /* we made it! */
4594     Py_DECREF(restuple);
4595     return 0;
4596 
4597   onError:
4598     Py_XDECREF(restuple);
4599     return -1;
4600 }
4601 
4602 /* --- UTF-7 Codec -------------------------------------------------------- */
4603 
4604 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4605 
/* Three small macros implementing the base-64 alphabet used by UTF-7.
   Each macro may evaluate its argument several times, so only pass
   side-effect-free expressions. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character c to its 6-bit value.  Any c outside the
   letter/digit/'+' ranges yields 63, so callers must check IS_BASE64()
   first. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte met in a UTF-7 string outside a shift
 * sequence decodes as itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4636 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only, hence
 * const: it can live in read-only storage and accidental writes are
 * rejected at compile time.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - code point to test; only 1..127 can ever be direct (NUL and
 *            non-ASCII never are).  Evaluated more than once, so it must
 *            have no side effects.
 * directO  - non-zero to emit Set O characters (category 1) as themselves.
 * directWS - non-zero to emit whitespace (category 2) as itself.
 */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4682 
4683 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4684 PyUnicode_DecodeUTF7(const char *s,
4685                      Py_ssize_t size,
4686                      const char *errors)
4687 {
4688     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4689 }
4690 
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  - the UTF-7 input bytes and their length.
 * errors   - error handler name, forwarded to
 *            unicode_decode_call_errorhandler_writer().
 * consumed - if NULL, input that ends inside a shift sequence with
 *            pending state is reported through the error handler.  If
 *            non-NULL, such a trailing sequence is not an error:
 *            *consumed is set to the offset of the '+' that opened it
 *            and the output produced since that '+' is discarded;
 *            otherwise *consumed is set to the final read offset.
 *
 * Returns the decoded string, or NULL on failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* [startinpos, endinpos) is the input range reported on error;
       startinpos also remembers where the current '+' was seen. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* writer.pos at the moment the current shift sequence began */
    Py_ssize_t shiftOutStart;
    /* number of bits currently held in base64buffer */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate waiting for its low half, or 0 */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point: the end-of-input check below jumps back here
           when the error handler left s before the end of the input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a low surrogate: write the lone high
                               surrogate as-is, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte decodes directly; in every case the
                   pending surrogate is cleared afterwards. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Shared error path, reached only by goto.  The handler is given
           the addresses of starts, e and s, so it can refresh the input
           bounds and set the position at which decoding resumes. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back inside the input:
               resume decoding inside the loop body */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* Output was written since the shift began and the writer
               holds non-ASCII data: build the result from only the first
               shiftOutStart characters instead of rewinding the writer.
               NOTE(review): presumably so the result is sized for the
               kept prefix alone -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4894 
4895 
4896 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4897 _PyUnicode_EncodeUTF7(PyObject *str,
4898                       int base64SetO,
4899                       int base64WhiteSpace,
4900                       const char *errors)
4901 {
4902     int kind;
4903     const void *data;
4904     Py_ssize_t len;
4905     PyObject *v;
4906     int inShift = 0;
4907     Py_ssize_t i;
4908     unsigned int base64bits = 0;
4909     unsigned long base64buffer = 0;
4910     char * out;
4911     const char * start;
4912 
4913     if (PyUnicode_READY(str) == -1)
4914         return NULL;
4915     kind = PyUnicode_KIND(str);
4916     data = PyUnicode_DATA(str);
4917     len = PyUnicode_GET_LENGTH(str);
4918 
4919     if (len == 0)
4920         return PyBytes_FromStringAndSize(NULL, 0);
4921 
4922     /* It might be possible to tighten this worst case */
4923     if (len > PY_SSIZE_T_MAX / 8)
4924         return PyErr_NoMemory();
4925     v = PyBytes_FromStringAndSize(NULL, len * 8);
4926     if (v == NULL)
4927         return NULL;
4928 
4929     start = out = PyBytes_AS_STRING(v);
4930     for (i = 0; i < len; ++i) {
4931         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4932 
4933         if (inShift) {
4934             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4935                 /* shifting out */
4936                 if (base64bits) { /* output remaining bits */
4937                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4938                     base64buffer = 0;
4939                     base64bits = 0;
4940                 }
4941                 inShift = 0;
4942                 /* Characters not in the BASE64 set implicitly unshift the sequence
4943                    so no '-' is required, except if the character is itself a '-' */
4944                 if (IS_BASE64(ch) || ch == '-') {
4945                     *out++ = '-';
4946                 }
4947                 *out++ = (char) ch;
4948             }
4949             else {
4950                 goto encode_char;
4951             }
4952         }
4953         else { /* not in a shift sequence */
4954             if (ch == '+') {
4955                 *out++ = '+';
4956                         *out++ = '-';
4957             }
4958             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4959                 *out++ = (char) ch;
4960             }
4961             else {
4962                 *out++ = '+';
4963                 inShift = 1;
4964                 goto encode_char;
4965             }
4966         }
4967         continue;
4968 encode_char:
4969         if (ch >= 0x10000) {
4970             assert(ch <= MAX_UNICODE);
4971 
4972             /* code first surrogate */
4973             base64bits += 16;
4974             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4975             while (base64bits >= 6) {
4976                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4977                 base64bits -= 6;
4978             }
4979             /* prepare second surrogate */
4980             ch = Py_UNICODE_LOW_SURROGATE(ch);
4981         }
4982         base64bits += 16;
4983         base64buffer = (base64buffer << 16) | ch;
4984         while (base64bits >= 6) {
4985             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4986             base64bits -= 6;
4987         }
4988     }
4989     if (base64bits)
4990         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4991     if (inShift)
4992         *out++ = '-';
4993     if (_PyBytes_Resize(&v, out - start) < 0)
4994         return NULL;
4995     return v;
4996 }
4997 
4998 #undef IS_BASE64
4999 #undef FROM_BASE64
5000 #undef TO_BASE64
5001 #undef DECODE_DIRECT
5002 #undef ENCODE_DIRECT
5003 
5004 /* --- UTF-8 Codec -------------------------------------------------------- */
5005 
5006 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5007 PyUnicode_DecodeUTF8(const char *s,
5008                      Py_ssize_t size,
5009                      const char *errors)
5010 {
5011     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5012 }
5013 
5014 #include "stringlib/asciilib.h"
5015 #include "stringlib/codecs.h"
5016 #include "stringlib/undef.h"
5017 
5018 #include "stringlib/ucs1lib.h"
5019 #include "stringlib/codecs.h"
5020 #include "stringlib/undef.h"
5021 
5022 #include "stringlib/ucs2lib.h"
5023 #include "stringlib/codecs.h"
5024 #include "stringlib/undef.h"
5025 
5026 #include "stringlib/ucs4lib.h"
5027 #include "stringlib/codecs.h"
5028 #include "stringlib/undef.h"
5029 
5030 /* Mask to quickly check whether a C 'size_t' contains a
5031    non-ASCII, UTF8-encoded char. */
5032 #if (SIZEOF_SIZE_T == 8)
5033 # define ASCII_CHAR_MASK 0x8080808080808080ULL
5034 #elif (SIZEOF_SIZE_T == 4)
5035 # define ASCII_CHAR_MASK 0x80808080U
5036 #else
5037 # error C 'size_t' size should be either 4 or 8!
5038 #endif
5039 
5040 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)5041 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5042 {
5043     const char *p = start;
5044 
5045 #if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5046     assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5047     if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5048         /* Fast path, see in STRINGLIB(utf8_decode) for
5049            an explanation. */
5050         /* Help allocation */
5051         const char *_p = p;
5052         Py_UCS1 * q = dest;
5053         while (_p + SIZEOF_SIZE_T <= end) {
5054             size_t value = *(const size_t *) _p;
5055             if (value & ASCII_CHAR_MASK)
5056                 break;
5057             *((size_t *)q) = value;
5058             _p += SIZEOF_SIZE_T;
5059             q += SIZEOF_SIZE_T;
5060         }
5061         p = _p;
5062         while (p < end) {
5063             if ((unsigned char)*p & 0x80)
5064                 break;
5065             *q++ = *p++;
5066         }
5067         return p - start;
5068     }
5069 #endif
5070     while (p < end) {
5071         /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5072            for an explanation. */
5073         if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
5074             /* Help allocation */
5075             const char *_p = p;
5076             while (_p + SIZEOF_SIZE_T <= end) {
5077                 size_t value = *(const size_t *) _p;
5078                 if (value & ASCII_CHAR_MASK)
5079                     break;
5080                 _p += SIZEOF_SIZE_T;
5081             }
5082             p = _p;
5083             if (_p == end)
5084                 break;
5085         }
5086         if ((unsigned char)*p & 0x80)
5087             break;
5088         ++p;
5089     }
5090     memcpy(dest, start, p - start);
5091     return p - start;
5092 }
5093 
5094 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5095 unicode_decode_utf8(const char *s, Py_ssize_t size,
5096                     _Py_error_handler error_handler, const char *errors,
5097                     Py_ssize_t *consumed)
5098 {
5099     if (size == 0) {
5100         if (consumed)
5101             *consumed = 0;
5102         _Py_RETURN_UNICODE_EMPTY();
5103     }
5104 
5105     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5106     if (size == 1 && (unsigned char)s[0] < 128) {
5107         if (consumed) {
5108             *consumed = 1;
5109         }
5110         return get_latin1_char((unsigned char)s[0]);
5111     }
5112 
5113     const char *starts = s;
5114     const char *end = s + size;
5115 
5116     // fast path: try ASCII string.
5117     PyObject *u = PyUnicode_New(size, 127);
5118     if (u == NULL) {
5119         return NULL;
5120     }
5121     s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5122     if (s == end) {
5123         return u;
5124     }
5125 
5126     // Use _PyUnicodeWriter after fast path is failed.
5127     _PyUnicodeWriter writer;
5128     _PyUnicodeWriter_InitWithBuffer(&writer, u);
5129     writer.pos = s - starts;
5130 
5131     Py_ssize_t startinpos, endinpos;
5132     const char *errmsg = "";
5133     PyObject *error_handler_obj = NULL;
5134     PyObject *exc = NULL;
5135 
5136     while (s < end) {
5137         Py_UCS4 ch;
5138         int kind = writer.kind;
5139 
5140         if (kind == PyUnicode_1BYTE_KIND) {
5141             if (PyUnicode_IS_ASCII(writer.buffer))
5142                 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5143             else
5144                 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5145         } else if (kind == PyUnicode_2BYTE_KIND) {
5146             ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5147         } else {
5148             assert(kind == PyUnicode_4BYTE_KIND);
5149             ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5150         }
5151 
5152         switch (ch) {
5153         case 0:
5154             if (s == end || consumed)
5155                 goto End;
5156             errmsg = "unexpected end of data";
5157             startinpos = s - starts;
5158             endinpos = end - starts;
5159             break;
5160         case 1:
5161             errmsg = "invalid start byte";
5162             startinpos = s - starts;
5163             endinpos = startinpos + 1;
5164             break;
5165         case 2:
5166             if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5167                 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5168             {
5169                 /* Truncated surrogate code in range D800-DFFF */
5170                 goto End;
5171             }
5172             /* fall through */
5173         case 3:
5174         case 4:
5175             errmsg = "invalid continuation byte";
5176             startinpos = s - starts;
5177             endinpos = startinpos + ch - 1;
5178             break;
5179         default:
5180             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5181                 goto onError;
5182             continue;
5183         }
5184 
5185         if (error_handler == _Py_ERROR_UNKNOWN)
5186             error_handler = _Py_GetErrorHandler(errors);
5187 
5188         switch (error_handler) {
5189         case _Py_ERROR_IGNORE:
5190             s += (endinpos - startinpos);
5191             break;
5192 
5193         case _Py_ERROR_REPLACE:
5194             if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5195                 goto onError;
5196             s += (endinpos - startinpos);
5197             break;
5198 
5199         case _Py_ERROR_SURROGATEESCAPE:
5200         {
5201             Py_ssize_t i;
5202 
5203             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5204                 goto onError;
5205             for (i=startinpos; i<endinpos; i++) {
5206                 ch = (Py_UCS4)(unsigned char)(starts[i]);
5207                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5208                                 ch + 0xdc00);
5209                 writer.pos++;
5210             }
5211             s += (endinpos - startinpos);
5212             break;
5213         }
5214 
5215         default:
5216             if (unicode_decode_call_errorhandler_writer(
5217                     errors, &error_handler_obj,
5218                     "utf-8", errmsg,
5219                     &starts, &end, &startinpos, &endinpos, &exc, &s,
5220                     &writer))
5221                 goto onError;
5222         }
5223     }
5224 
5225 End:
5226     if (consumed)
5227         *consumed = s - starts;
5228 
5229     Py_XDECREF(error_handler_obj);
5230     Py_XDECREF(exc);
5231     return _PyUnicodeWriter_Finish(&writer);
5232 
5233 onError:
5234     Py_XDECREF(error_handler_obj);
5235     Py_XDECREF(exc);
5236     _PyUnicodeWriter_Dealloc(&writer);
5237     return NULL;
5238 }
5239 
5240 
5241 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5242 PyUnicode_DecodeUTF8Stateful(const char *s,
5243                              Py_ssize_t size,
5244                              const char *errors,
5245                              Py_ssize_t *consumed)
5246 {
5247     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5248 }
5249 
5250 
/* UTF-8 decoder: 'errors' selects the error handler.  Only
   _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE and _Py_ERROR_SURROGATEPASS
   are supported.

   On success, write a pointer to a newly allocated wide character string into
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
   (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure, return -1.

   On decoding error (when the surrogateescape handler is not in use),
   return -2. If wlen is non-NULL, write the start of the illegal byte
   sequence into *wlen. If reason is not NULL, write the decoding error
   message into *reason.

   If 'errors' is any other error handler, return -3. */
5263 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5264 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5265                  const char **reason, _Py_error_handler errors)
5266 {
5267     const char *orig_s = s;
5268     const char *e;
5269     wchar_t *unicode;
5270     Py_ssize_t outpos;
5271 
5272     int surrogateescape = 0;
5273     int surrogatepass = 0;
5274     switch (errors)
5275     {
5276     case _Py_ERROR_STRICT:
5277         break;
5278     case _Py_ERROR_SURROGATEESCAPE:
5279         surrogateescape = 1;
5280         break;
5281     case _Py_ERROR_SURROGATEPASS:
5282         surrogatepass = 1;
5283         break;
5284     default:
5285         return -3;
5286     }
5287 
5288     /* Note: size will always be longer than the resulting Unicode
5289        character count */
5290     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5291         return -1;
5292     }
5293 
5294     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5295     if (!unicode) {
5296         return -1;
5297     }
5298 
5299     /* Unpack UTF-8 encoded data */
5300     e = s + size;
5301     outpos = 0;
5302     while (s < e) {
5303         Py_UCS4 ch;
5304 #if SIZEOF_WCHAR_T == 4
5305         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5306 #else
5307         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5308 #endif
5309         if (ch > 0xFF) {
5310 #if SIZEOF_WCHAR_T == 4
5311             Py_UNREACHABLE();
5312 #else
5313             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5314             /* write a surrogate pair */
5315             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5316             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5317 #endif
5318         }
5319         else {
5320             if (!ch && s == e) {
5321                 break;
5322             }
5323 
5324             if (surrogateescape) {
5325                 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5326             }
5327             else {
5328                 /* Is it a valid three-byte code? */
5329                 if (surrogatepass
5330                     && (e - s) >= 3
5331                     && (s[0] & 0xf0) == 0xe0
5332                     && (s[1] & 0xc0) == 0x80
5333                     && (s[2] & 0xc0) == 0x80)
5334                 {
5335                     ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5336                     s += 3;
5337                     unicode[outpos++] = ch;
5338                 }
5339                 else {
5340                     PyMem_RawFree(unicode );
5341                     if (reason != NULL) {
5342                         switch (ch) {
5343                         case 0:
5344                             *reason = "unexpected end of data";
5345                             break;
5346                         case 1:
5347                             *reason = "invalid start byte";
5348                             break;
5349                         /* 2, 3, 4 */
5350                         default:
5351                             *reason = "invalid continuation byte";
5352                             break;
5353                         }
5354                     }
5355                     if (wlen != NULL) {
5356                         *wlen = s - orig_s;
5357                     }
5358                     return -2;
5359                 }
5360             }
5361         }
5362     }
5363     unicode[outpos] = L'\0';
5364     if (wlen) {
5365         *wlen = outpos;
5366     }
5367     *wstr = unicode;
5368     return 0;
5369 }
5370 
5371 
5372 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5373 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5374                                size_t *wlen)
5375 {
5376     wchar_t *wstr;
5377     int res = _Py_DecodeUTF8Ex(arg, arglen,
5378                                &wstr, wlen,
5379                                NULL, _Py_ERROR_SURROGATEESCAPE);
5380     if (res != 0) {
5381         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5382         assert(res != -3);
5383         if (wlen) {
5384             *wlen = (size_t)res;
5385         }
5386         return NULL;
5387     }
5388     return wstr;
5389 }
5390 
5391 
5392 /* UTF-8 encoder using the surrogateescape error handler .
5393 
5394    On success, return 0 and write the newly allocated character string (use
5395    PyMem_Free() to free the memory) into *str.
5396 
5397    On encoding failure, return -2 and write the position of the invalid
5398    surrogate character into *error_pos (if error_pos is set) and the decoding
5399    error message into *reason (if reason is set).
5400 
5401    On memory allocation failure, return -1. */
5402 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5403 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5404                  const char **reason, int raw_malloc, _Py_error_handler errors)
5405 {
5406     const Py_ssize_t max_char_size = 4;
5407     Py_ssize_t len = wcslen(text);
5408 
5409     assert(len >= 0);
5410 
5411     int surrogateescape = 0;
5412     int surrogatepass = 0;
5413     switch (errors)
5414     {
5415     case _Py_ERROR_STRICT:
5416         break;
5417     case _Py_ERROR_SURROGATEESCAPE:
5418         surrogateescape = 1;
5419         break;
5420     case _Py_ERROR_SURROGATEPASS:
5421         surrogatepass = 1;
5422         break;
5423     default:
5424         return -3;
5425     }
5426 
5427     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5428         return -1;
5429     }
5430     char *bytes;
5431     if (raw_malloc) {
5432         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5433     }
5434     else {
5435         bytes = PyMem_Malloc((len + 1) * max_char_size);
5436     }
5437     if (bytes == NULL) {
5438         return -1;
5439     }
5440 
5441     char *p = bytes;
5442     Py_ssize_t i;
5443     for (i = 0; i < len; ) {
5444         Py_ssize_t ch_pos = i;
5445         Py_UCS4 ch = text[i];
5446         i++;
5447 #if Py_UNICODE_SIZE == 2
5448         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5449             && i < len
5450             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5451         {
5452             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5453             i++;
5454         }
5455 #endif
5456 
5457         if (ch < 0x80) {
5458             /* Encode ASCII */
5459             *p++ = (char) ch;
5460 
5461         }
5462         else if (ch < 0x0800) {
5463             /* Encode Latin-1 */
5464             *p++ = (char)(0xc0 | (ch >> 6));
5465             *p++ = (char)(0x80 | (ch & 0x3f));
5466         }
5467         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5468             /* surrogateescape error handler */
5469             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5470                 if (error_pos != NULL) {
5471                     *error_pos = (size_t)ch_pos;
5472                 }
5473                 if (reason != NULL) {
5474                     *reason = "encoding error";
5475                 }
5476                 if (raw_malloc) {
5477                     PyMem_RawFree(bytes);
5478                 }
5479                 else {
5480                     PyMem_Free(bytes);
5481                 }
5482                 return -2;
5483             }
5484             *p++ = (char)(ch & 0xff);
5485         }
5486         else if (ch < 0x10000) {
5487             *p++ = (char)(0xe0 | (ch >> 12));
5488             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5489             *p++ = (char)(0x80 | (ch & 0x3f));
5490         }
5491         else {  /* ch >= 0x10000 */
5492             assert(ch <= MAX_UNICODE);
5493             /* Encode UCS4 Unicode ordinals */
5494             *p++ = (char)(0xf0 | (ch >> 18));
5495             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5496             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497             *p++ = (char)(0x80 | (ch & 0x3f));
5498         }
5499     }
5500     *p++ = '\0';
5501 
5502     size_t final_size = (p - bytes);
5503     char *bytes2;
5504     if (raw_malloc) {
5505         bytes2 = PyMem_RawRealloc(bytes, final_size);
5506     }
5507     else {
5508         bytes2 = PyMem_Realloc(bytes, final_size);
5509     }
5510     if (bytes2 == NULL) {
5511         if (error_pos != NULL) {
5512             *error_pos = (size_t)-1;
5513         }
5514         if (raw_malloc) {
5515             PyMem_RawFree(bytes);
5516         }
5517         else {
5518             PyMem_Free(bytes);
5519         }
5520         return -1;
5521     }
5522     *str = bytes2;
5523     return 0;
5524 }
5525 
5526 
5527 /* Primary internal function which creates utf8 encoded bytes objects.
5528 
5529    Allocation strategy:  if the string is short, convert into a stack buffer
5530    and allocate exactly as much space needed at the end.  Else allocate the
5531    maximum possible needed (4 result bytes per Unicode character), and return
5532    the excess memory at the end.
5533 */
5534 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5535 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5536                     const char *errors)
5537 {
5538     if (!PyUnicode_Check(unicode)) {
5539         PyErr_BadArgument();
5540         return NULL;
5541     }
5542 
5543     if (PyUnicode_READY(unicode) == -1)
5544         return NULL;
5545 
5546     if (PyUnicode_UTF8(unicode))
5547         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5548                                          PyUnicode_UTF8_LENGTH(unicode));
5549 
5550     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5551     const void *data = PyUnicode_DATA(unicode);
5552     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5553 
5554     _PyBytesWriter writer;
5555     char *end;
5556 
5557     switch (kind) {
5558     default:
5559         Py_UNREACHABLE();
5560     case PyUnicode_1BYTE_KIND:
5561         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5562         assert(!PyUnicode_IS_ASCII(unicode));
5563         end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5564         break;
5565     case PyUnicode_2BYTE_KIND:
5566         end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5567         break;
5568     case PyUnicode_4BYTE_KIND:
5569         end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5570         break;
5571     }
5572 
5573     if (end == NULL) {
5574         _PyBytesWriter_Dealloc(&writer);
5575         return NULL;
5576     }
5577     return _PyBytesWriter_Finish(&writer, end);
5578 }
5579 
5580 static int
unicode_fill_utf8(PyObject * unicode)5581 unicode_fill_utf8(PyObject *unicode)
5582 {
5583     /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5584     assert(!PyUnicode_IS_ASCII(unicode));
5585 
5586     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5587     const void *data = PyUnicode_DATA(unicode);
5588     Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5589 
5590     _PyBytesWriter writer;
5591     char *end;
5592 
5593     switch (kind) {
5594     default:
5595         Py_UNREACHABLE();
5596     case PyUnicode_1BYTE_KIND:
5597         end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5598                                    _Py_ERROR_STRICT, NULL);
5599         break;
5600     case PyUnicode_2BYTE_KIND:
5601         end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5602                                    _Py_ERROR_STRICT, NULL);
5603         break;
5604     case PyUnicode_4BYTE_KIND:
5605         end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5606                                    _Py_ERROR_STRICT, NULL);
5607         break;
5608     }
5609     if (end == NULL) {
5610         _PyBytesWriter_Dealloc(&writer);
5611         return -1;
5612     }
5613 
5614     const char *start = writer.use_small_buffer ? writer.small_buffer :
5615                     PyBytes_AS_STRING(writer.buffer);
5616     Py_ssize_t len = end - start;
5617 
5618     char *cache = PyObject_Malloc(len + 1);
5619     if (cache == NULL) {
5620         _PyBytesWriter_Dealloc(&writer);
5621         PyErr_NoMemory();
5622         return -1;
5623     }
5624     _PyUnicode_UTF8(unicode) = cache;
5625     _PyUnicode_UTF8_LENGTH(unicode) = len;
5626     memcpy(cache, start, len);
5627     cache[len] = '\0';
5628     _PyBytesWriter_Dealloc(&writer);
5629     return 0;
5630 }
5631 
5632 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5633 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5634 {
5635     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5636 }
5637 
5638 
5639 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5640 PyUnicode_AsUTF8String(PyObject *unicode)
5641 {
5642     return _PyUnicode_AsUTF8String(unicode, NULL);
5643 }
5644 
5645 /* --- UTF-32 Codec ------------------------------------------------------- */
5646 
5647 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5648 PyUnicode_DecodeUTF32(const char *s,
5649                       Py_ssize_t size,
5650                       const char *errors,
5651                       int *byteorder)
5652 {
5653     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5654 }
5655 
/* Decode 'size' bytes of UTF-32 data starting at 's' into a str object.

   errors:    name of the error handler, forwarded to
              unicode_decode_call_errorhandler_writer() on invalid input.
   byteorder: if not NULL, *byteorder gives the initial byte order
              (negative: little-endian, positive: big-endian, 0: look for a
              BOM). When it is 0 and at least 4 bytes are available, the
              detected order (-1, 1, or still 0 if no BOM was found) is
              written back to *byteorder.
   consumed:  if not NULL, an incomplete trailing group of fewer than 4
              bytes is not an error; decoding stops before it and the number
              of bytes actually decoded is stored in *consumed.

   Return the result of _PyUnicodeWriter_Finish(), or NULL on error. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* q is the read cursor, e the end of the input. */
    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are assembled as a little-endian value, so
           0x0000FEFF means the data is little-endian and 0xFFFE0000 means
           it is big-endian. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left to decode (empty input, or only a BOM). */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means no order was given or detected: fall back
       to the byte order of the host. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per complete or partial 4-byte group. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 if fewer than 4 bytes remain; otherwise it ends up as
           the last code unit read by the fast loop below (either the last
           one written, or the one that made the loop stop). */
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast loop: write straight into the writer's buffer for as long
               as every character fits its current kind and is not a
               surrogate. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    /* NOTE(review): the surrogate test is skipped for the
                       1-byte kind, presumably because maxch is then below
                       0xD800 and the test above already stops the loop --
                       confirm. */
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast loop stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* All complete 4-byte groups were decoded. Done if the input is
               exhausted, or if the caller accepts partial input. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit the writer's current maximum character. */
            if (ch < 0x110000) {
                /* Valid code point: write it through the writer API and go
                   back to the fast loop (maxch is re-read at the top). */
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5800 
/* Encode the str object 'str' to UTF-32 and return a new bytes object.

   errors:    name of the error handler, forwarded to
              unicode_encode_call_errorhandler() when the per-kind encoder
              stops before the end of the string.
   byteorder: 0 writes a BOM (U+FEFF) first and uses the host byte order;
              a negative value selects little-endian and a positive value
              big-endian, both without a BOM.

   Return NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
    /* True when the requested byte order is the host's own, so 32-bit units
       can be stored without byte swapping. */
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Reserve one 4-byte unit per character, plus one for the BOM when
       byteorder == 0; refuse lengths whose byte count would overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    /* Jumping to 'done' skips the decrefs below; that is fine on these early
       paths because errorHandler and exc are still NULL. */
    if (len == 0)
        goto done;

    /* Codec name used in error reports. */
    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* 1-byte strings are encoded in a single call, with no error handling. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* The helper's return value is added to pos as the number of
           characters it handled. If it stopped before the end, the character
           at pos is passed to the error handler (presumably a lone
           surrogate, given the error message -- confirm against the
           helper). */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* The handler returns a replacement (bytes or str) and the position
           at which encoding should resume (newpos). */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so it must be a whole
               number of 4-byte units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            /* A str replacement must be ASCII; it is encoded with the
               1-byte encoder below. */
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* The units already reserved for characters [pos, newpos) can hold
           part of the replacement; if the handler moved backwards
           (newpos < pos), the characters to re-encode need extra room. */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* Remember the write offset: resizing may move the buffer. */
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5949 
5950 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5951 PyUnicode_AsUTF32String(PyObject *unicode)
5952 {
5953     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5954 }
5955 
5956 /* --- UTF-16 Codec ------------------------------------------------------- */
5957 
5958 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5959 PyUnicode_DecodeUTF16(const char *s,
5960                       Py_ssize_t size,
5961                       const char *errors,
5962                       int *byteorder)
5963 {
5964     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5965 }
5966 
/* Decode 'size' bytes of UTF-16 data starting at 's' into a str object.

   errors:    name of the error handler, forwarded to
              unicode_decode_call_errorhandler_writer() on invalid input.
   byteorder: if not NULL, *byteorder gives the initial byte order
              (negative: little-endian, positive: big-endian, 0: look for a
              BOM). When it is 0 and at least 2 bytes are available, the
              detected order (-1, 1, or still 0 if no BOM was found) is
              written back to *byteorder.
   consumed:  if not NULL, incomplete data at the end of the input is not an
              error; decoding stops before it and the number of bytes
              actually decoded is stored in *consumed.

   Return the result of _PyUnicodeWriter_Finish(), or NULL on error. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    /* q is the read cursor, e the end of the input. */
    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first two bytes are assembled as a little-endian value, so
           0xFEFF means the data is little-endian and 0xFFFE means it is
           big-endian. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left to decode (empty input, or only a BOM). */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means no order was given or detected: the host
       byte order is used. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than 2 bytes remain. Otherwise it is the
           value returned by the per-kind helper: 0..3 are status codes
           handled by the switch below, anything else is a character to be
           written through the writer API (default case). */
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Pick the helper matching the writer's current buffer kind;
               each one advances q and writer.pos itself. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step back over the last 2-byte unit so that it is either left
               unconsumed (stateful mode) or reported as the start of the
               error. NOTE(review): presumably that unit is a high surrogate
               whose pair is missing at the end of the input -- confirm
               against the helper. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The offending unit is the one just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* The offending unit starts two units before q; only that first
               unit is reported. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6121 
6122 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)6123 _PyUnicode_EncodeUTF16(PyObject *str,
6124                        const char *errors,
6125                        int byteorder)
6126 {
6127     enum PyUnicode_Kind kind;
6128     const void *data;
6129     Py_ssize_t len;
6130     PyObject *v;
6131     unsigned short *out;
6132     Py_ssize_t pairs;
6133 #if PY_BIG_ENDIAN
6134     int native_ordering = byteorder >= 0;
6135 #else
6136     int native_ordering = byteorder <= 0;
6137 #endif
6138     const char *encoding;
6139     Py_ssize_t nsize, pos;
6140     PyObject *errorHandler = NULL;
6141     PyObject *exc = NULL;
6142     PyObject *rep = NULL;
6143 
6144     if (!PyUnicode_Check(str)) {
6145         PyErr_BadArgument();
6146         return NULL;
6147     }
6148     if (PyUnicode_READY(str) == -1)
6149         return NULL;
6150     kind = PyUnicode_KIND(str);
6151     data = PyUnicode_DATA(str);
6152     len = PyUnicode_GET_LENGTH(str);
6153 
6154     pairs = 0;
6155     if (kind == PyUnicode_4BYTE_KIND) {
6156         const Py_UCS4 *in = (const Py_UCS4 *)data;
6157         const Py_UCS4 *end = in + len;
6158         while (in < end) {
6159             if (*in++ >= 0x10000) {
6160                 pairs++;
6161             }
6162         }
6163     }
6164     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6165         return PyErr_NoMemory();
6166     }
6167     nsize = len + pairs + (byteorder == 0);
6168     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6169     if (v == NULL) {
6170         return NULL;
6171     }
6172 
6173     /* output buffer is 2-bytes aligned */
6174     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6175     out = (unsigned short *)PyBytes_AS_STRING(v);
6176     if (byteorder == 0) {
6177         *out++ = 0xFEFF;
6178     }
6179     if (len == 0) {
6180         goto done;
6181     }
6182 
6183     if (kind == PyUnicode_1BYTE_KIND) {
6184         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6185         goto done;
6186     }
6187 
6188     if (byteorder < 0) {
6189         encoding = "utf-16-le";
6190     }
6191     else if (byteorder > 0) {
6192         encoding = "utf-16-be";
6193     }
6194     else {
6195         encoding = "utf-16";
6196     }
6197 
6198     pos = 0;
6199     while (pos < len) {
6200         Py_ssize_t newpos, repsize, moreunits;
6201 
6202         if (kind == PyUnicode_2BYTE_KIND) {
6203             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6204                                         &out, native_ordering);
6205         }
6206         else {
6207             assert(kind == PyUnicode_4BYTE_KIND);
6208             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6209                                         &out, native_ordering);
6210         }
6211         if (pos == len)
6212             break;
6213 
6214         rep = unicode_encode_call_errorhandler(
6215                 errors, &errorHandler,
6216                 encoding, "surrogates not allowed",
6217                 str, &exc, pos, pos + 1, &newpos);
6218         if (!rep)
6219             goto error;
6220 
6221         if (PyBytes_Check(rep)) {
6222             repsize = PyBytes_GET_SIZE(rep);
6223             if (repsize & 1) {
6224                 raise_encode_exception(&exc, encoding,
6225                                        str, pos, pos + 1,
6226                                        "surrogates not allowed");
6227                 goto error;
6228             }
6229             moreunits = repsize / 2;
6230         }
6231         else {
6232             assert(PyUnicode_Check(rep));
6233             if (PyUnicode_READY(rep) < 0)
6234                 goto error;
6235             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6236             if (!PyUnicode_IS_ASCII(rep)) {
6237                 raise_encode_exception(&exc, encoding,
6238                                        str, pos, pos + 1,
6239                                        "surrogates not allowed");
6240                 goto error;
6241             }
6242         }
6243         moreunits += pos - newpos;
6244         pos = newpos;
6245 
6246         /* two bytes are reserved for each surrogate */
6247         if (moreunits > 0) {
6248             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
6249             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
6250                 /* integer overflow */
6251                 PyErr_NoMemory();
6252                 goto error;
6253             }
6254             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
6255                 goto error;
6256             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6257         }
6258 
6259         if (PyBytes_Check(rep)) {
6260             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6261             out += repsize / 2;
6262         } else /* rep is unicode */ {
6263             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6264             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6265                                  &out, native_ordering);
6266         }
6267 
6268         Py_CLEAR(rep);
6269     }
6270 
6271     /* Cut back to size actually needed. This is necessary for, for example,
6272     encoding of a string containing isolated surrogates and the 'ignore' handler
6273     is used. */
6274     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6275     if (nsize != PyBytes_GET_SIZE(v))
6276       _PyBytes_Resize(&v, nsize);
6277     Py_XDECREF(errorHandler);
6278     Py_XDECREF(exc);
6279   done:
6280     return v;
6281   error:
6282     Py_XDECREF(rep);
6283     Py_XDECREF(errorHandler);
6284     Py_XDECREF(exc);
6285     Py_XDECREF(v);
6286     return NULL;
6287 #undef STORECHAR
6288 }
6289 
6290 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6291 PyUnicode_AsUTF16String(PyObject *unicode)
6292 {
6293     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6294 }
6295 
6296 /* --- Unicode Escape Codec ----------------------------------------------- */
6297 
6298 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6299 
6300 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6301 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6302                                Py_ssize_t size,
6303                                const char *errors,
6304                                Py_ssize_t *consumed,
6305                                const char **first_invalid_escape)
6306 {
6307     const char *starts = s;
6308     _PyUnicodeWriter writer;
6309     const char *end;
6310     PyObject *errorHandler = NULL;
6311     PyObject *exc = NULL;
6312 
6313     // so we can remember if we've seen an invalid escape char or not
6314     *first_invalid_escape = NULL;
6315 
6316     if (size == 0) {
6317         if (consumed) {
6318             *consumed = 0;
6319         }
6320         _Py_RETURN_UNICODE_EMPTY();
6321     }
6322     /* Escaped strings will always be longer than the resulting
6323        Unicode string, so we start with size here and then reduce the
6324        length after conversion to the true value.
6325        (but if the error callback returns a long replacement string
6326        we'll have to allocate more space) */
6327     _PyUnicodeWriter_Init(&writer);
6328     writer.min_length = size;
6329     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6330         goto onError;
6331     }
6332 
6333     end = s + size;
6334     while (s < end) {
6335         unsigned char c = (unsigned char) *s++;
6336         Py_UCS4 ch;
6337         int count;
6338         const char *message;
6339 
6340 #define WRITE_ASCII_CHAR(ch)                                                  \
6341             do {                                                              \
6342                 assert(ch <= 127);                                            \
6343                 assert(writer.pos < writer.size);                             \
6344                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6345             } while(0)
6346 
6347 #define WRITE_CHAR(ch)                                                        \
6348             do {                                                              \
6349                 if (ch <= writer.maxchar) {                                   \
6350                     assert(writer.pos < writer.size);                         \
6351                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6352                 }                                                             \
6353                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6354                     goto onError;                                             \
6355                 }                                                             \
6356             } while(0)
6357 
6358         /* Non-escape characters are interpreted as Unicode ordinals */
6359         if (c != '\\') {
6360             WRITE_CHAR(c);
6361             continue;
6362         }
6363 
6364         Py_ssize_t startinpos = s - starts - 1;
6365         /* \ - Escapes */
6366         if (s >= end) {
6367             message = "\\ at end of string";
6368             goto incomplete;
6369         }
6370         c = (unsigned char) *s++;
6371 
6372         assert(writer.pos < writer.size);
6373         switch (c) {
6374 
6375             /* \x escapes */
6376         case '\n': continue;
6377         case '\\': WRITE_ASCII_CHAR('\\'); continue;
6378         case '\'': WRITE_ASCII_CHAR('\''); continue;
6379         case '\"': WRITE_ASCII_CHAR('\"'); continue;
6380         case 'b': WRITE_ASCII_CHAR('\b'); continue;
6381         /* FF */
6382         case 'f': WRITE_ASCII_CHAR('\014'); continue;
6383         case 't': WRITE_ASCII_CHAR('\t'); continue;
6384         case 'n': WRITE_ASCII_CHAR('\n'); continue;
6385         case 'r': WRITE_ASCII_CHAR('\r'); continue;
6386         /* VT */
6387         case 'v': WRITE_ASCII_CHAR('\013'); continue;
6388         /* BEL, not classic C */
6389         case 'a': WRITE_ASCII_CHAR('\007'); continue;
6390 
6391             /* \OOO (octal) escapes */
6392         case '0': case '1': case '2': case '3':
6393         case '4': case '5': case '6': case '7':
6394             ch = c - '0';
6395             if (s < end && '0' <= *s && *s <= '7') {
6396                 ch = (ch<<3) + *s++ - '0';
6397                 if (s < end && '0' <= *s && *s <= '7') {
6398                     ch = (ch<<3) + *s++ - '0';
6399                 }
6400             }
6401             if (ch > 0377) {
6402                 if (*first_invalid_escape == NULL) {
6403                     *first_invalid_escape = s-3; /* Back up 3 chars, since we've
6404                                                     already incremented s. */
6405                 }
6406             }
6407             WRITE_CHAR(ch);
6408             continue;
6409 
6410             /* hex escapes */
6411             /* \xXX */
6412         case 'x':
6413             count = 2;
6414             message = "truncated \\xXX escape";
6415             goto hexescape;
6416 
6417             /* \uXXXX */
6418         case 'u':
6419             count = 4;
6420             message = "truncated \\uXXXX escape";
6421             goto hexescape;
6422 
6423             /* \UXXXXXXXX */
6424         case 'U':
6425             count = 8;
6426             message = "truncated \\UXXXXXXXX escape";
6427         hexescape:
6428             for (ch = 0; count; ++s, --count) {
6429                 if (s >= end) {
6430                     goto incomplete;
6431                 }
6432                 c = (unsigned char)*s;
6433                 ch <<= 4;
6434                 if (c >= '0' && c <= '9') {
6435                     ch += c - '0';
6436                 }
6437                 else if (c >= 'a' && c <= 'f') {
6438                     ch += c - ('a' - 10);
6439                 }
6440                 else if (c >= 'A' && c <= 'F') {
6441                     ch += c - ('A' - 10);
6442                 }
6443                 else {
6444                     goto error;
6445                 }
6446             }
6447 
6448             /* when we get here, ch is a 32-bit unicode character */
6449             if (ch > MAX_UNICODE) {
6450                 message = "illegal Unicode character";
6451                 goto error;
6452             }
6453 
6454             WRITE_CHAR(ch);
6455             continue;
6456 
6457             /* \N{name} */
6458         case 'N':
6459             if (ucnhash_capi == NULL) {
6460                 /* load the unicode data module */
6461                 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6462                                                 PyUnicodeData_CAPSULE_NAME, 1);
6463                 if (ucnhash_capi == NULL) {
6464                     PyErr_SetString(
6465                         PyExc_UnicodeError,
6466                         "\\N escapes not supported (can't load unicodedata module)"
6467                         );
6468                     goto onError;
6469                 }
6470             }
6471 
6472             message = "malformed \\N character escape";
6473             if (s >= end) {
6474                 goto incomplete;
6475             }
6476             if (*s == '{') {
6477                 const char *start = ++s;
6478                 size_t namelen;
6479                 /* look for the closing brace */
6480                 while (s < end && *s != '}')
6481                     s++;
6482                 if (s >= end) {
6483                     goto incomplete;
6484                 }
6485                 namelen = s - start;
6486                 if (namelen) {
6487                     /* found a name.  look it up in the unicode database */
6488                     s++;
6489                     ch = 0xffffffff; /* in case 'getcode' messes up */
6490                     if (namelen <= INT_MAX &&
6491                         ucnhash_capi->getcode(start, (int)namelen,
6492                                               &ch, 0)) {
6493                         assert(ch <= MAX_UNICODE);
6494                         WRITE_CHAR(ch);
6495                         continue;
6496                     }
6497                     message = "unknown Unicode character name";
6498                 }
6499             }
6500             goto error;
6501 
6502         default:
6503             if (*first_invalid_escape == NULL) {
6504                 *first_invalid_escape = s-1; /* Back up one char, since we've
6505                                                 already incremented s. */
6506             }
6507             WRITE_ASCII_CHAR('\\');
6508             WRITE_CHAR(c);
6509             continue;
6510         }
6511 
6512       incomplete:
6513         if (consumed) {
6514             *consumed = startinpos;
6515             break;
6516         }
6517       error:;
6518         Py_ssize_t endinpos = s-starts;
6519         writer.min_length = end - s + writer.pos;
6520         if (unicode_decode_call_errorhandler_writer(
6521                 errors, &errorHandler,
6522                 "unicodeescape", message,
6523                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6524                 &writer)) {
6525             goto onError;
6526         }
6527         assert(end - s <= writer.size - writer.pos);
6528 
6529 #undef WRITE_ASCII_CHAR
6530 #undef WRITE_CHAR
6531     }
6532 
6533     Py_XDECREF(errorHandler);
6534     Py_XDECREF(exc);
6535     return _PyUnicodeWriter_Finish(&writer);
6536 
6537   onError:
6538     _PyUnicodeWriter_Dealloc(&writer);
6539     Py_XDECREF(errorHandler);
6540     Py_XDECREF(exc);
6541     return NULL;
6542 }
6543 
6544 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6545 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6546                               Py_ssize_t size,
6547                               const char *errors,
6548                               Py_ssize_t *consumed)
6549 {
6550     const char *first_invalid_escape;
6551     PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6552                                                       consumed,
6553                                                       &first_invalid_escape);
6554     if (result == NULL)
6555         return NULL;
6556     if (first_invalid_escape != NULL) {
6557         unsigned char c = *first_invalid_escape;
6558         if ('4' <= c && c <= '7') {
6559             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6560                                  "invalid octal escape sequence '\\%.3s'",
6561                                  first_invalid_escape) < 0)
6562             {
6563                 Py_DECREF(result);
6564                 return NULL;
6565             }
6566         }
6567         else {
6568             if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6569                                  "invalid escape sequence '\\%c'",
6570                                  c) < 0)
6571             {
6572                 Py_DECREF(result);
6573                 return NULL;
6574             }
6575         }
6576     }
6577     return result;
6578 }
6579 
6580 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6581 PyUnicode_DecodeUnicodeEscape(const char *s,
6582                               Py_ssize_t size,
6583                               const char *errors)
6584 {
6585     return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6586 }
6587 
6588 /* Return a Unicode-Escape string version of the Unicode object. */
6589 
6590 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6591 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6592 {
6593     Py_ssize_t i, len;
6594     PyObject *repr;
6595     char *p;
6596     enum PyUnicode_Kind kind;
6597     const void *data;
6598     Py_ssize_t expandsize;
6599 
6600     /* Initial allocation is based on the longest-possible character
6601        escape.
6602 
6603        For UCS1 strings it's '\xxx', 4 bytes per source character.
6604        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6605        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6606     */
6607 
6608     if (!PyUnicode_Check(unicode)) {
6609         PyErr_BadArgument();
6610         return NULL;
6611     }
6612     if (PyUnicode_READY(unicode) == -1) {
6613         return NULL;
6614     }
6615 
6616     len = PyUnicode_GET_LENGTH(unicode);
6617     if (len == 0) {
6618         return PyBytes_FromStringAndSize(NULL, 0);
6619     }
6620 
6621     kind = PyUnicode_KIND(unicode);
6622     data = PyUnicode_DATA(unicode);
6623     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6624        bytes, and 1 byte characters 4. */
6625     expandsize = kind * 2 + 2;
6626     if (len > PY_SSIZE_T_MAX / expandsize) {
6627         return PyErr_NoMemory();
6628     }
6629     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6630     if (repr == NULL) {
6631         return NULL;
6632     }
6633 
6634     p = PyBytes_AS_STRING(repr);
6635     for (i = 0; i < len; i++) {
6636         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6637 
6638         /* U+0000-U+00ff range */
6639         if (ch < 0x100) {
6640             if (ch >= ' ' && ch < 127) {
6641                 if (ch != '\\') {
6642                     /* Copy printable US ASCII as-is */
6643                     *p++ = (char) ch;
6644                 }
6645                 /* Escape backslashes */
6646                 else {
6647                     *p++ = '\\';
6648                     *p++ = '\\';
6649                 }
6650             }
6651 
6652             /* Map special whitespace to '\t', \n', '\r' */
6653             else if (ch == '\t') {
6654                 *p++ = '\\';
6655                 *p++ = 't';
6656             }
6657             else if (ch == '\n') {
6658                 *p++ = '\\';
6659                 *p++ = 'n';
6660             }
6661             else if (ch == '\r') {
6662                 *p++ = '\\';
6663                 *p++ = 'r';
6664             }
6665 
6666             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6667             else {
6668                 *p++ = '\\';
6669                 *p++ = 'x';
6670                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6671                 *p++ = Py_hexdigits[ch & 0x000F];
6672             }
6673         }
6674         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6675         else if (ch < 0x10000) {
6676             *p++ = '\\';
6677             *p++ = 'u';
6678             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6679             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6680             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6681             *p++ = Py_hexdigits[ch & 0x000F];
6682         }
6683         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6684         else {
6685 
6686             /* Make sure that the first two digits are zero */
6687             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6688             *p++ = '\\';
6689             *p++ = 'U';
6690             *p++ = '0';
6691             *p++ = '0';
6692             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6693             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6694             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6695             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6696             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6697             *p++ = Py_hexdigits[ch & 0x0000000F];
6698         }
6699     }
6700 
6701     assert(p - PyBytes_AS_STRING(repr) > 0);
6702     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6703         return NULL;
6704     }
6705     return repr;
6706 }
6707 
6708 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6709 
6710 PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6711 _PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6712                                           Py_ssize_t size,
6713                                           const char *errors,
6714                                           Py_ssize_t *consumed)
6715 {
6716     const char *starts = s;
6717     _PyUnicodeWriter writer;
6718     const char *end;
6719     PyObject *errorHandler = NULL;
6720     PyObject *exc = NULL;
6721 
6722     if (size == 0) {
6723         if (consumed) {
6724             *consumed = 0;
6725         }
6726         _Py_RETURN_UNICODE_EMPTY();
6727     }
6728 
6729     /* Escaped strings will always be longer than the resulting
6730        Unicode string, so we start with size here and then reduce the
6731        length after conversion to the true value. (But decoding error
6732        handler might have to resize the string) */
6733     _PyUnicodeWriter_Init(&writer);
6734     writer.min_length = size;
6735     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6736         goto onError;
6737     }
6738 
6739     end = s + size;
6740     while (s < end) {
6741         unsigned char c = (unsigned char) *s++;
6742         Py_UCS4 ch;
6743         int count;
6744         const char *message;
6745 
6746 #define WRITE_CHAR(ch)                                                        \
6747             do {                                                              \
6748                 if (ch <= writer.maxchar) {                                   \
6749                     assert(writer.pos < writer.size);                         \
6750                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6751                 }                                                             \
6752                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6753                     goto onError;                                             \
6754                 }                                                             \
6755             } while(0)
6756 
6757         /* Non-escape characters are interpreted as Unicode ordinals */
6758         if (c != '\\' || (s >= end && !consumed)) {
6759             WRITE_CHAR(c);
6760             continue;
6761         }
6762 
6763         Py_ssize_t startinpos = s - starts - 1;
6764         /* \ - Escapes */
6765         if (s >= end) {
6766             assert(consumed);
6767             // Set message to silent compiler warning.
6768             // Actually it is never used.
6769             message = "\\ at end of string";
6770             goto incomplete;
6771         }
6772 
6773         c = (unsigned char) *s++;
6774         if (c == 'u') {
6775             count = 4;
6776             message = "truncated \\uXXXX escape";
6777         }
6778         else if (c == 'U') {
6779             count = 8;
6780             message = "truncated \\UXXXXXXXX escape";
6781         }
6782         else {
6783             assert(writer.pos < writer.size);
6784             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6785             WRITE_CHAR(c);
6786             continue;
6787         }
6788 
6789         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6790         for (ch = 0; count; ++s, --count) {
6791             if (s >= end) {
6792                 goto incomplete;
6793             }
6794             c = (unsigned char)*s;
6795             ch <<= 4;
6796             if (c >= '0' && c <= '9') {
6797                 ch += c - '0';
6798             }
6799             else if (c >= 'a' && c <= 'f') {
6800                 ch += c - ('a' - 10);
6801             }
6802             else if (c >= 'A' && c <= 'F') {
6803                 ch += c - ('A' - 10);
6804             }
6805             else {
6806                 goto error;
6807             }
6808         }
6809         if (ch > MAX_UNICODE) {
6810             message = "\\Uxxxxxxxx out of range";
6811             goto error;
6812         }
6813         WRITE_CHAR(ch);
6814         continue;
6815 
6816       incomplete:
6817         if (consumed) {
6818             *consumed = startinpos;
6819             break;
6820         }
6821       error:;
6822         Py_ssize_t endinpos = s-starts;
6823         writer.min_length = end - s + writer.pos;
6824         if (unicode_decode_call_errorhandler_writer(
6825                 errors, &errorHandler,
6826                 "rawunicodeescape", message,
6827                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6828                 &writer)) {
6829             goto onError;
6830         }
6831         assert(end - s <= writer.size - writer.pos);
6832 
6833 #undef WRITE_CHAR
6834     }
6835     Py_XDECREF(errorHandler);
6836     Py_XDECREF(exc);
6837     return _PyUnicodeWriter_Finish(&writer);
6838 
6839   onError:
6840     _PyUnicodeWriter_Dealloc(&writer);
6841     Py_XDECREF(errorHandler);
6842     Py_XDECREF(exc);
6843     return NULL;
6844 }
6845 
6846 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6847 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6848                                  Py_ssize_t size,
6849                                  const char *errors)
6850 {
6851     return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6852 }
6853 
6854 
6855 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6856 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6857 {
6858     PyObject *repr;
6859     char *p;
6860     Py_ssize_t expandsize, pos;
6861     int kind;
6862     const void *data;
6863     Py_ssize_t len;
6864 
6865     if (!PyUnicode_Check(unicode)) {
6866         PyErr_BadArgument();
6867         return NULL;
6868     }
6869     if (PyUnicode_READY(unicode) == -1) {
6870         return NULL;
6871     }
6872     kind = PyUnicode_KIND(unicode);
6873     data = PyUnicode_DATA(unicode);
6874     len = PyUnicode_GET_LENGTH(unicode);
6875     if (kind == PyUnicode_1BYTE_KIND) {
6876         return PyBytes_FromStringAndSize(data, len);
6877     }
6878 
6879     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6880        bytes, and 1 byte characters 4. */
6881     expandsize = kind * 2 + 2;
6882 
6883     if (len > PY_SSIZE_T_MAX / expandsize) {
6884         return PyErr_NoMemory();
6885     }
6886     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6887     if (repr == NULL) {
6888         return NULL;
6889     }
6890     if (len == 0) {
6891         return repr;
6892     }
6893 
6894     p = PyBytes_AS_STRING(repr);
6895     for (pos = 0; pos < len; pos++) {
6896         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6897 
6898         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6899         if (ch < 0x100) {
6900             *p++ = (char) ch;
6901         }
6902         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6903         else if (ch < 0x10000) {
6904             *p++ = '\\';
6905             *p++ = 'u';
6906             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6907             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6908             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6909             *p++ = Py_hexdigits[ch & 15];
6910         }
6911         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6912         else {
6913             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6914             *p++ = '\\';
6915             *p++ = 'U';
6916             *p++ = '0';
6917             *p++ = '0';
6918             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6919             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6920             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6921             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6922             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6923             *p++ = Py_hexdigits[ch & 15];
6924         }
6925     }
6926 
6927     assert(p > PyBytes_AS_STRING(repr));
6928     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6929         return NULL;
6930     }
6931     return repr;
6932 }
6933 
6934 /* --- Latin-1 Codec ------------------------------------------------------ */
6935 
6936 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6937 PyUnicode_DecodeLatin1(const char *s,
6938                        Py_ssize_t size,
6939                        const char *errors)
6940 {
6941     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6942     return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6943 }
6944 
6945 /* create or adjust a UnicodeEncodeError */
6946 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6947 make_encode_exception(PyObject **exceptionObject,
6948                       const char *encoding,
6949                       PyObject *unicode,
6950                       Py_ssize_t startpos, Py_ssize_t endpos,
6951                       const char *reason)
6952 {
6953     if (*exceptionObject == NULL) {
6954         *exceptionObject = PyObject_CallFunction(
6955             PyExc_UnicodeEncodeError, "sOnns",
6956             encoding, unicode, startpos, endpos, reason);
6957     }
6958     else {
6959         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6960             goto onError;
6961         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6962             goto onError;
6963         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6964             goto onError;
6965         return;
6966       onError:
6967         Py_CLEAR(*exceptionObject);
6968     }
6969 }
6970 
6971 /* raises a UnicodeEncodeError */
6972 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6973 raise_encode_exception(PyObject **exceptionObject,
6974                        const char *encoding,
6975                        PyObject *unicode,
6976                        Py_ssize_t startpos, Py_ssize_t endpos,
6977                        const char *reason)
6978 {
6979     make_encode_exception(exceptionObject,
6980                           encoding, unicode, startpos, endpos, reason);
6981     if (*exceptionObject != NULL)
6982         PyCodec_StrictErrors(*exceptionObject);
6983 }
6984 
6985 /* error handling callback helper:
6986    build arguments, call the callback and check the arguments,
6987    put the result into newpos and return the replacement string, which
6988    has to be freed by the caller */
6989 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6990 unicode_encode_call_errorhandler(const char *errors,
6991                                  PyObject **errorHandler,
6992                                  const char *encoding, const char *reason,
6993                                  PyObject *unicode, PyObject **exceptionObject,
6994                                  Py_ssize_t startpos, Py_ssize_t endpos,
6995                                  Py_ssize_t *newpos)
6996 {
6997     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6998     Py_ssize_t len;
6999     PyObject *restuple;
7000     PyObject *resunicode;
7001 
7002     if (*errorHandler == NULL) {
7003         *errorHandler = PyCodec_LookupError(errors);
7004         if (*errorHandler == NULL)
7005             return NULL;
7006     }
7007 
7008     if (PyUnicode_READY(unicode) == -1)
7009         return NULL;
7010     len = PyUnicode_GET_LENGTH(unicode);
7011 
7012     make_encode_exception(exceptionObject,
7013                           encoding, unicode, startpos, endpos, reason);
7014     if (*exceptionObject == NULL)
7015         return NULL;
7016 
7017     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7018     if (restuple == NULL)
7019         return NULL;
7020     if (!PyTuple_Check(restuple)) {
7021         PyErr_SetString(PyExc_TypeError, &argparse[3]);
7022         Py_DECREF(restuple);
7023         return NULL;
7024     }
7025     if (!PyArg_ParseTuple(restuple, argparse,
7026                           &resunicode, newpos)) {
7027         Py_DECREF(restuple);
7028         return NULL;
7029     }
7030     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7031         PyErr_SetString(PyExc_TypeError, &argparse[3]);
7032         Py_DECREF(restuple);
7033         return NULL;
7034     }
7035     if (*newpos<0)
7036         *newpos = len + *newpos;
7037     if (*newpos<0 || *newpos>len) {
7038         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7039         Py_DECREF(restuple);
7040         return NULL;
7041     }
7042     Py_INCREF(resunicode);
7043     Py_DECREF(restuple);
7044     return resunicode;
7045 }
7046 
/* Encode `unicode` for a single-byte codec in which exactly the code points
   below `limit` are encodable (each one is written as a single byte).

   `limit` also selects the codec name and message used in error reports:
   256 gives "latin-1", any other value gives "ascii" (the callers visible
   in this file pass 256 or 128).  `errors` names the error handler; it is
   only resolved when the first unencodable character is found.

   Returns the encoded bytes object, or NULL on failure. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    /* error handler callable and exception object, created lazily and
       reused across all errors of this call */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    /* replacement returned by a custom error handler (owned reference) */
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* empty input: nothing to encode, return an empty bytes object */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters: extend [collstart, collend)
               over the whole run of consecutive characters >= limit */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* one '?' per unencodable character; this fits in the
                   bytes that were preallocated for the run */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                /* skip the whole run */
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Code points U+DC80..U+DCFF are written as the single
                   byte (ch - 0xdc00); stop at the first character of the
                   run that is outside that range. */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                /* the whole run was escaped: done with this error */
                if (i >= collend)
                    break;
                /* otherwise let the generic handler below deal with the
                   remainder of the run, which now starts at pos */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                /* any other handler: call it through the codec error
                   callback machinery */
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* The handler moved the input position backwards, so
                       characters in [newpos, collstart) will be visited
                       again; request collstart - newpos more bytes from
                       the writer (presumably to hold their re-encoding). */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    if (PyUnicode_READY(rep) < 0)
                        goto onError;

                    /* A str replacement must itself be encodable: 1-byte
                       kind for limit 256, pure ASCII otherwise. */
                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    /* 1-byte kind: the character data can be copied to the
                       output unchanged */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                /* continue at the position chosen by the handler */
                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    /* release everything acquired so far; rep is still set only when the
       failure happened while processing a handler's replacement */
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7228 
7229 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7230 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7231 {
7232     if (!PyUnicode_Check(unicode)) {
7233         PyErr_BadArgument();
7234         return NULL;
7235     }
7236     if (PyUnicode_READY(unicode) == -1)
7237         return NULL;
7238     /* Fast path: if it is a one-byte string, construct
7239        bytes object directly. */
7240     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7241         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7242                                          PyUnicode_GET_LENGTH(unicode));
7243     /* Non-Latin-1 characters present. Defer to above function to
7244        raise the exception. */
7245     return unicode_encode_ucs1(unicode, errors, 256);
7246 }
7247 
7248 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7249 PyUnicode_AsLatin1String(PyObject *unicode)
7250 {
7251     return _PyUnicode_AsLatin1String(unicode, NULL);
7252 }
7253 
7254 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7255 
7256 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7257 PyUnicode_DecodeASCII(const char *s,
7258                       Py_ssize_t size,
7259                       const char *errors)
7260 {
7261     const char *starts = s;
7262     const char *e = s + size;
7263     PyObject *error_handler_obj = NULL;
7264     PyObject *exc = NULL;
7265     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7266 
7267     if (size == 0)
7268         _Py_RETURN_UNICODE_EMPTY();
7269 
7270     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7271     if (size == 1 && (unsigned char)s[0] < 128) {
7272         return get_latin1_char((unsigned char)s[0]);
7273     }
7274 
7275     // Shortcut for simple case
7276     PyObject *u = PyUnicode_New(size, 127);
7277     if (u == NULL) {
7278         return NULL;
7279     }
7280     Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7281     if (outpos == size) {
7282         return u;
7283     }
7284 
7285     _PyUnicodeWriter writer;
7286     _PyUnicodeWriter_InitWithBuffer(&writer, u);
7287     writer.pos = outpos;
7288 
7289     s += outpos;
7290     int kind = writer.kind;
7291     void *data = writer.data;
7292     Py_ssize_t startinpos, endinpos;
7293 
7294     while (s < e) {
7295         unsigned char c = (unsigned char)*s;
7296         if (c < 128) {
7297             PyUnicode_WRITE(kind, data, writer.pos, c);
7298             writer.pos++;
7299             ++s;
7300             continue;
7301         }
7302 
7303         /* byte outsize range 0x00..0x7f: call the error handler */
7304 
7305         if (error_handler == _Py_ERROR_UNKNOWN)
7306             error_handler = _Py_GetErrorHandler(errors);
7307 
7308         switch (error_handler)
7309         {
7310         case _Py_ERROR_REPLACE:
7311         case _Py_ERROR_SURROGATEESCAPE:
7312             /* Fast-path: the error handler only writes one character,
7313                but we may switch to UCS2 at the first write */
7314             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7315                 goto onError;
7316             kind = writer.kind;
7317             data = writer.data;
7318 
7319             if (error_handler == _Py_ERROR_REPLACE)
7320                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7321             else
7322                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7323             writer.pos++;
7324             ++s;
7325             break;
7326 
7327         case _Py_ERROR_IGNORE:
7328             ++s;
7329             break;
7330 
7331         default:
7332             startinpos = s-starts;
7333             endinpos = startinpos + 1;
7334             if (unicode_decode_call_errorhandler_writer(
7335                     errors, &error_handler_obj,
7336                     "ascii", "ordinal not in range(128)",
7337                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7338                     &writer))
7339                 goto onError;
7340             kind = writer.kind;
7341             data = writer.data;
7342         }
7343     }
7344     Py_XDECREF(error_handler_obj);
7345     Py_XDECREF(exc);
7346     return _PyUnicodeWriter_Finish(&writer);
7347 
7348   onError:
7349     _PyUnicodeWriter_Dealloc(&writer);
7350     Py_XDECREF(error_handler_obj);
7351     Py_XDECREF(exc);
7352     return NULL;
7353 }
7354 
7355 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7356 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7357 {
7358     if (!PyUnicode_Check(unicode)) {
7359         PyErr_BadArgument();
7360         return NULL;
7361     }
7362     if (PyUnicode_READY(unicode) == -1)
7363         return NULL;
7364     /* Fast path: if it is an ASCII-only string, construct bytes object
7365        directly. Else defer to above function to raise the exception. */
7366     if (PyUnicode_IS_ASCII(unicode))
7367         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7368                                          PyUnicode_GET_LENGTH(unicode));
7369     return unicode_encode_ucs1(unicode, errors, 128);
7370 }
7371 
7372 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7373 PyUnicode_AsASCIIString(PyObject *unicode)
7374 {
7375     return _PyUnicode_AsASCIIString(unicode, NULL);
7376 }
7377 
7378 #ifdef MS_WINDOWS
7379 
7380 /* --- MBCS codecs for Windows -------------------------------------------- */
7381 
7382 #if SIZEOF_INT < SIZEOF_SIZE_T
7383 #define NEED_RETRY
7384 #endif
7385 
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7390 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7391 
7392 #ifndef WC_ERR_INVALID_CHARS
7393 #  define WC_ERR_INVALID_CHARS 0x0080
7394 #endif
7395 
7396 static const char*
code_page_name(UINT code_page,PyObject ** obj)7397 code_page_name(UINT code_page, PyObject **obj)
7398 {
7399     *obj = NULL;
7400     if (code_page == CP_ACP)
7401         return "mbcs";
7402     if (code_page == CP_UTF7)
7403         return "CP_UTF7";
7404     if (code_page == CP_UTF8)
7405         return "CP_UTF8";
7406 
7407     *obj = PyBytes_FromFormat("cp%u", code_page);
7408     if (*obj == NULL)
7409         return NULL;
7410     return PyBytes_AS_STRING(*obj);
7411 }
7412 
7413 static DWORD
decode_code_page_flags(UINT code_page)7414 decode_code_page_flags(UINT code_page)
7415 {
7416     if (code_page == CP_UTF7) {
7417         /* The CP_UTF7 decoder only supports flags=0 */
7418         return 0;
7419     }
7420     else
7421         return MB_ERR_INVALID_CHARS;
7422 }
7423 
7424 /*
7425  * Decode a byte string from a Windows code page into unicode object in strict
7426  * mode.
7427  *
7428  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7429  * OSError and returns -1 on other error.
7430  */
7431 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7432 decode_code_page_strict(UINT code_page,
7433                         wchar_t **buf,
7434                         Py_ssize_t *bufsize,
7435                         const char *in,
7436                         int insize)
7437 {
7438     DWORD flags = MB_ERR_INVALID_CHARS;
7439     wchar_t *out;
7440     DWORD outsize;
7441 
7442     /* First get the size of the result */
7443     assert(insize > 0);
7444     while ((outsize = MultiByteToWideChar(code_page, flags,
7445                                           in, insize, NULL, 0)) <= 0)
7446     {
7447         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7448             goto error;
7449         }
7450         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7451         flags = 0;
7452     }
7453 
7454     /* Extend a wchar_t* buffer */
7455     Py_ssize_t n = *bufsize;   /* Get the current length */
7456     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7457         return -1;
7458     }
7459     out = *buf + n;
7460 
7461     /* Do the conversion */
7462     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7463     if (outsize <= 0)
7464         goto error;
7465     return insize;
7466 
7467 error:
7468     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7469         return -2;
7470     PyErr_SetFromWindowsErr(0);
7471     return -1;
7472 }
7473 
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is the `size` bytes at `in`; decoded wide characters are
 * appended to the buffer *buf, whose current length is *bufsize (updated on
 * success).  `final` is non-zero when no more input will follow; when it is
 * zero, an undecodable sequence at the very end of the input is left
 * unconsumed instead of being reported to the error handler.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* owns the storage of `encoding` when the name had to be built */
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* guard the size computation below against overflow: up to
       Py_ARRAY_LENGTH(buffer) wide characters are reserved per input byte */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try with 1 input byte first and add one byte
           at a time until the conversion succeeds */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* retry with the same insize (continue jumps to the loop
                   condition, insize is not incremented) */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            /* no prefix of the remaining input could be decoded */
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* report the single byte at `in` to the error handler */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            /* recompute the output pointer from *buf and the output
               position, both of which were passed to the handler call by
               address */
            out = *buf + outpos;
        }
        else {
            /* success: consume insize bytes, emit outsize wide chars */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* shared exit path: reached by fall-through on success (ret holds the
       consumed size) and by goto on failure (ret is still -1) */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7603 
7604 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7605 decode_code_page_stateful(int code_page,
7606                           const char *s, Py_ssize_t size,
7607                           const char *errors, Py_ssize_t *consumed)
7608 {
7609     wchar_t *buf = NULL;
7610     Py_ssize_t bufsize = 0;
7611     int chunk_size, final, converted, done;
7612 
7613     if (code_page < 0) {
7614         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7615         return NULL;
7616     }
7617     if (size < 0) {
7618         PyErr_BadInternalCall();
7619         return NULL;
7620     }
7621 
7622     if (consumed)
7623         *consumed = 0;
7624 
7625     do
7626     {
7627 #ifdef NEED_RETRY
7628         if (size > DECODING_CHUNK_SIZE) {
7629             chunk_size = DECODING_CHUNK_SIZE;
7630             final = 0;
7631             done = 0;
7632         }
7633         else
7634 #endif
7635         {
7636             chunk_size = (int)size;
7637             final = (consumed == NULL);
7638             done = 1;
7639         }
7640 
7641         if (chunk_size == 0 && done) {
7642             if (buf != NULL)
7643                 break;
7644             _Py_RETURN_UNICODE_EMPTY();
7645         }
7646 
7647         converted = decode_code_page_strict(code_page, &buf, &bufsize,
7648                                             s, chunk_size);
7649         if (converted == -2)
7650             converted = decode_code_page_errors(code_page, &buf, &bufsize,
7651                                                 s, chunk_size,
7652                                                 errors, final);
7653         assert(converted != 0 || done);
7654 
7655         if (converted < 0) {
7656             PyMem_Free(buf);
7657             return NULL;
7658         }
7659 
7660         if (consumed)
7661             *consumed += converted;
7662 
7663         s += converted;
7664         size -= converted;
7665     } while (!done);
7666 
7667     PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7668     PyMem_Free(buf);
7669     return v;
7670 }
7671 
7672 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7673 PyUnicode_DecodeCodePageStateful(int code_page,
7674                                  const char *s,
7675                                  Py_ssize_t size,
7676                                  const char *errors,
7677                                  Py_ssize_t *consumed)
7678 {
7679     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7680 }
7681 
7682 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7683 PyUnicode_DecodeMBCSStateful(const char *s,
7684                              Py_ssize_t size,
7685                              const char *errors,
7686                              Py_ssize_t *consumed)
7687 {
7688     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7689 }
7690 
7691 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7692 PyUnicode_DecodeMBCS(const char *s,
7693                      Py_ssize_t size,
7694                      const char *errors)
7695 {
7696     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7697 }
7698 
7699 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7700 encode_code_page_flags(UINT code_page, const char *errors)
7701 {
7702     if (code_page == CP_UTF8) {
7703         return WC_ERR_INVALID_CHARS;
7704     }
7705     else if (code_page == CP_UTF7) {
7706         /* CP_UTF7 only supports flags=0 */
7707         return 0;
7708     }
7709     else {
7710         if (errors != NULL && strcmp(errors, "replace") == 0)
7711             return 0;
7712         else
7713             return WC_NO_BEST_FIT_CHARS;
7714     }
7715 }
7716 
7717 /*
7718  * Encode a Unicode string to a Windows code page into a byte string in strict
7719  * mode.
7720  *
7721  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7722  * an OSError and returns -1 on other error.
7723  */
7724 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7725 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7726                         PyObject *unicode, Py_ssize_t offset, int len,
7727                         const char* errors)
7728 {
7729     BOOL usedDefaultChar = FALSE;
7730     BOOL *pusedDefaultChar = &usedDefaultChar;
7731     int outsize;
7732     wchar_t *p;
7733     Py_ssize_t size;
7734     const DWORD flags = encode_code_page_flags(code_page, NULL);
7735     char *out;
7736     /* Create a substring so that we can get the UTF-16 representation
7737        of just the slice under consideration. */
7738     PyObject *substring;
7739     int ret = -1;
7740 
7741     assert(len > 0);
7742 
7743     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7744         pusedDefaultChar = &usedDefaultChar;
7745     else
7746         pusedDefaultChar = NULL;
7747 
7748     substring = PyUnicode_Substring(unicode, offset, offset+len);
7749     if (substring == NULL)
7750         return -1;
7751 #if USE_UNICODE_WCHAR_CACHE
7752 _Py_COMP_DIAG_PUSH
7753 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
7754     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7755     if (p == NULL) {
7756         Py_DECREF(substring);
7757         return -1;
7758     }
7759 _Py_COMP_DIAG_POP
7760 #else /* USE_UNICODE_WCHAR_CACHE */
7761     p = PyUnicode_AsWideCharString(substring, &size);
7762     Py_CLEAR(substring);
7763     if (p == NULL) {
7764         return -1;
7765     }
7766 #endif /* USE_UNICODE_WCHAR_CACHE */
7767     assert(size <= INT_MAX);
7768 
7769     /* First get the size of the result */
7770     outsize = WideCharToMultiByte(code_page, flags,
7771                                   p, (int)size,
7772                                   NULL, 0,
7773                                   NULL, pusedDefaultChar);
7774     if (outsize <= 0)
7775         goto error;
7776     /* If we used a default char, then we failed! */
7777     if (pusedDefaultChar && *pusedDefaultChar) {
7778         ret = -2;
7779         goto done;
7780     }
7781 
7782     if (*outbytes == NULL) {
7783         /* Create string object */
7784         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7785         if (*outbytes == NULL) {
7786             goto done;
7787         }
7788         out = PyBytes_AS_STRING(*outbytes);
7789     }
7790     else {
7791         /* Extend string object */
7792         const Py_ssize_t n = PyBytes_Size(*outbytes);
7793         if (outsize > PY_SSIZE_T_MAX - n) {
7794             PyErr_NoMemory();
7795             goto done;
7796         }
7797         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7798             goto done;
7799         }
7800         out = PyBytes_AS_STRING(*outbytes) + n;
7801     }
7802 
7803     /* Do the conversion */
7804     outsize = WideCharToMultiByte(code_page, flags,
7805                                   p, (int)size,
7806                                   out, outsize,
7807                                   NULL, pusedDefaultChar);
7808     if (outsize <= 0)
7809         goto error;
7810     if (pusedDefaultChar && *pusedDefaultChar) {
7811         ret = -2;
7812         goto done;
7813     }
7814     ret = 0;
7815 
7816 done:
7817 #if USE_UNICODE_WCHAR_CACHE
7818     Py_DECREF(substring);
7819 #else /* USE_UNICODE_WCHAR_CACHE */
7820     PyMem_Free(p);
7821 #endif /* USE_UNICODE_WCHAR_CACHE */
7822     return ret;
7823 
7824 error:
7825     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7826         ret = -2;
7827         goto done;
7828     }
7829     PyErr_SetFromWindowsErr(0);
7830     goto done;
7831 }
7832 
7833 /*
7834  * Encode a Unicode string to a Windows code page into a byte string using an
7835  * error handler.
7836  *
7837  * Returns consumed characters if succeed, or raise an OSError and returns
7838  * -1 on other error.
7839  */
7840 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7841 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7842                         PyObject *unicode, Py_ssize_t unicode_offset,
7843                         Py_ssize_t insize, const char* errors)
7844 {
7845     const DWORD flags = encode_code_page_flags(code_page, errors);
7846     Py_ssize_t pos = unicode_offset;
7847     Py_ssize_t endin = unicode_offset + insize;
7848     /* Ideally, we should get reason from FormatMessage. This is the Windows
7849        2000 English version of the message. */
7850     const char *reason = "invalid character";
7851     /* 4=maximum length of a UTF-8 sequence */
7852     char buffer[4];
7853     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7854     Py_ssize_t outsize;
7855     char *out;
7856     PyObject *errorHandler = NULL;
7857     PyObject *exc = NULL;
7858     PyObject *encoding_obj = NULL;
7859     const char *encoding;
7860     Py_ssize_t newpos, newoutsize;
7861     PyObject *rep;
7862     int ret = -1;
7863 
7864     assert(insize > 0);
7865 
7866     encoding = code_page_name(code_page, &encoding_obj);
7867     if (encoding == NULL)
7868         return -1;
7869 
7870     if (errors == NULL || strcmp(errors, "strict") == 0) {
7871         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7872            then we raise a UnicodeEncodeError. */
7873         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7874         if (exc != NULL) {
7875             PyCodec_StrictErrors(exc);
7876             Py_DECREF(exc);
7877         }
7878         Py_XDECREF(encoding_obj);
7879         return -1;
7880     }
7881 
7882     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7883         pusedDefaultChar = &usedDefaultChar;
7884     else
7885         pusedDefaultChar = NULL;
7886 
7887     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7888         PyErr_NoMemory();
7889         goto error;
7890     }
7891     outsize = insize * Py_ARRAY_LENGTH(buffer);
7892 
7893     if (*outbytes == NULL) {
7894         /* Create string object */
7895         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7896         if (*outbytes == NULL)
7897             goto error;
7898         out = PyBytes_AS_STRING(*outbytes);
7899     }
7900     else {
7901         /* Extend string object */
7902         Py_ssize_t n = PyBytes_Size(*outbytes);
7903         if (n > PY_SSIZE_T_MAX - outsize) {
7904             PyErr_NoMemory();
7905             goto error;
7906         }
7907         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7908             goto error;
7909         out = PyBytes_AS_STRING(*outbytes) + n;
7910     }
7911 
7912     /* Encode the string character per character */
7913     while (pos < endin)
7914     {
7915         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7916         wchar_t chars[2];
7917         int charsize;
7918         if (ch < 0x10000) {
7919             chars[0] = (wchar_t)ch;
7920             charsize = 1;
7921         }
7922         else {
7923             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7924             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7925             charsize = 2;
7926         }
7927 
7928         outsize = WideCharToMultiByte(code_page, flags,
7929                                       chars, charsize,
7930                                       buffer, Py_ARRAY_LENGTH(buffer),
7931                                       NULL, pusedDefaultChar);
7932         if (outsize > 0) {
7933             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7934             {
7935                 pos++;
7936                 memcpy(out, buffer, outsize);
7937                 out += outsize;
7938                 continue;
7939             }
7940         }
7941         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7942             PyErr_SetFromWindowsErr(0);
7943             goto error;
7944         }
7945 
7946         rep = unicode_encode_call_errorhandler(
7947                   errors, &errorHandler, encoding, reason,
7948                   unicode, &exc,
7949                   pos, pos + 1, &newpos);
7950         if (rep == NULL)
7951             goto error;
7952 
7953         Py_ssize_t morebytes = pos - newpos;
7954         if (PyBytes_Check(rep)) {
7955             outsize = PyBytes_GET_SIZE(rep);
7956             morebytes += outsize;
7957             if (morebytes > 0) {
7958                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7959                 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7960                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7961                     Py_DECREF(rep);
7962                     goto error;
7963                 }
7964                 out = PyBytes_AS_STRING(*outbytes) + offset;
7965             }
7966             memcpy(out, PyBytes_AS_STRING(rep), outsize);
7967             out += outsize;
7968         }
7969         else {
7970             Py_ssize_t i;
7971             enum PyUnicode_Kind kind;
7972             const void *data;
7973 
7974             if (PyUnicode_READY(rep) == -1) {
7975                 Py_DECREF(rep);
7976                 goto error;
7977             }
7978 
7979             outsize = PyUnicode_GET_LENGTH(rep);
7980             morebytes += outsize;
7981             if (morebytes > 0) {
7982                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7983                 newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
7984                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7985                     Py_DECREF(rep);
7986                     goto error;
7987                 }
7988                 out = PyBytes_AS_STRING(*outbytes) + offset;
7989             }
7990             kind = PyUnicode_KIND(rep);
7991             data = PyUnicode_DATA(rep);
7992             for (i=0; i < outsize; i++) {
7993                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7994                 if (ch > 127) {
7995                     raise_encode_exception(&exc,
7996                         encoding, unicode,
7997                         pos, pos + 1,
7998                         "unable to encode error handler result to ASCII");
7999                     Py_DECREF(rep);
8000                     goto error;
8001                 }
8002                 *out = (unsigned char)ch;
8003                 out++;
8004             }
8005         }
8006         pos = newpos;
8007         Py_DECREF(rep);
8008     }
8009     /* write a NUL byte */
8010     *out = 0;
8011     outsize = out - PyBytes_AS_STRING(*outbytes);
8012     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8013     if (_PyBytes_Resize(outbytes, outsize) < 0)
8014         goto error;
8015     ret = 0;
8016 
8017 error:
8018     Py_XDECREF(encoding_obj);
8019     Py_XDECREF(errorHandler);
8020     Py_XDECREF(exc);
8021     return ret;
8022 }
8023 
8024 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8025 encode_code_page(int code_page,
8026                  PyObject *unicode,
8027                  const char *errors)
8028 {
8029     Py_ssize_t len;
8030     PyObject *outbytes = NULL;
8031     Py_ssize_t offset;
8032     int chunk_len, ret, done;
8033 
8034     if (!PyUnicode_Check(unicode)) {
8035         PyErr_BadArgument();
8036         return NULL;
8037     }
8038 
8039     if (PyUnicode_READY(unicode) == -1)
8040         return NULL;
8041     len = PyUnicode_GET_LENGTH(unicode);
8042 
8043     if (code_page < 0) {
8044         PyErr_SetString(PyExc_ValueError, "invalid code page number");
8045         return NULL;
8046     }
8047 
8048     if (len == 0)
8049         return PyBytes_FromStringAndSize(NULL, 0);
8050 
8051     offset = 0;
8052     do
8053     {
8054 #ifdef NEED_RETRY
8055         if (len > DECODING_CHUNK_SIZE) {
8056             chunk_len = DECODING_CHUNK_SIZE;
8057             done = 0;
8058         }
8059         else
8060 #endif
8061         {
8062             chunk_len = (int)len;
8063             done = 1;
8064         }
8065 
8066         ret = encode_code_page_strict(code_page, &outbytes,
8067                                       unicode, offset, chunk_len,
8068                                       errors);
8069         if (ret == -2)
8070             ret = encode_code_page_errors(code_page, &outbytes,
8071                                           unicode, offset,
8072                                           chunk_len, errors);
8073         if (ret < 0) {
8074             Py_XDECREF(outbytes);
8075             return NULL;
8076         }
8077 
8078         offset += chunk_len;
8079         len -= chunk_len;
8080     } while (!done);
8081 
8082     return outbytes;
8083 }
8084 
8085 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8086 PyUnicode_EncodeCodePage(int code_page,
8087                          PyObject *unicode,
8088                          const char *errors)
8089 {
8090     return encode_code_page(code_page, unicode, errors);
8091 }
8092 
8093 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8094 PyUnicode_AsMBCSString(PyObject *unicode)
8095 {
8096     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8097 }
8098 
8099 #undef NEED_RETRY
8100 
8101 #endif /* MS_WINDOWS */
8102 
8103 /* --- Character Mapping Codec -------------------------------------------- */
8104 
8105 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8106 charmap_decode_string(const char *s,
8107                       Py_ssize_t size,
8108                       PyObject *mapping,
8109                       const char *errors,
8110                       _PyUnicodeWriter *writer)
8111 {
8112     const char *starts = s;
8113     const char *e;
8114     Py_ssize_t startinpos, endinpos;
8115     PyObject *errorHandler = NULL, *exc = NULL;
8116     Py_ssize_t maplen;
8117     enum PyUnicode_Kind mapkind;
8118     const void *mapdata;
8119     Py_UCS4 x;
8120     unsigned char ch;
8121 
8122     if (PyUnicode_READY(mapping) == -1)
8123         return -1;
8124 
8125     maplen = PyUnicode_GET_LENGTH(mapping);
8126     mapdata = PyUnicode_DATA(mapping);
8127     mapkind = PyUnicode_KIND(mapping);
8128 
8129     e = s + size;
8130 
8131     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8132         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8133          * is disabled in encoding aliases, latin1 is preferred because
8134          * its implementation is faster. */
8135         const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8136         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8137         Py_UCS4 maxchar = writer->maxchar;
8138 
8139         assert (writer->kind == PyUnicode_1BYTE_KIND);
8140         while (s < e) {
8141             ch = *s;
8142             x = mapdata_ucs1[ch];
8143             if (x > maxchar) {
8144                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8145                     goto onError;
8146                 maxchar = writer->maxchar;
8147                 outdata = (Py_UCS1 *)writer->data;
8148             }
8149             outdata[writer->pos] = x;
8150             writer->pos++;
8151             ++s;
8152         }
8153         return 0;
8154     }
8155 
8156     while (s < e) {
8157         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8158             enum PyUnicode_Kind outkind = writer->kind;
8159             const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8160             if (outkind == PyUnicode_1BYTE_KIND) {
8161                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8162                 Py_UCS4 maxchar = writer->maxchar;
8163                 while (s < e) {
8164                     ch = *s;
8165                     x = mapdata_ucs2[ch];
8166                     if (x > maxchar)
8167                         goto Error;
8168                     outdata[writer->pos] = x;
8169                     writer->pos++;
8170                     ++s;
8171                 }
8172                 break;
8173             }
8174             else if (outkind == PyUnicode_2BYTE_KIND) {
8175                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8176                 while (s < e) {
8177                     ch = *s;
8178                     x = mapdata_ucs2[ch];
8179                     if (x == 0xFFFE)
8180                         goto Error;
8181                     outdata[writer->pos] = x;
8182                     writer->pos++;
8183                     ++s;
8184                 }
8185                 break;
8186             }
8187         }
8188         ch = *s;
8189 
8190         if (ch < maplen)
8191             x = PyUnicode_READ(mapkind, mapdata, ch);
8192         else
8193             x = 0xfffe; /* invalid value */
8194 Error:
8195         if (x == 0xfffe)
8196         {
8197             /* undefined mapping */
8198             startinpos = s-starts;
8199             endinpos = startinpos+1;
8200             if (unicode_decode_call_errorhandler_writer(
8201                     errors, &errorHandler,
8202                     "charmap", "character maps to <undefined>",
8203                     &starts, &e, &startinpos, &endinpos, &exc, &s,
8204                     writer)) {
8205                 goto onError;
8206             }
8207             continue;
8208         }
8209 
8210         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8211             goto onError;
8212         ++s;
8213     }
8214     Py_XDECREF(errorHandler);
8215     Py_XDECREF(exc);
8216     return 0;
8217 
8218 onError:
8219     Py_XDECREF(errorHandler);
8220     Py_XDECREF(exc);
8221     return -1;
8222 }
8223 
8224 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8225 charmap_decode_mapping(const char *s,
8226                        Py_ssize_t size,
8227                        PyObject *mapping,
8228                        const char *errors,
8229                        _PyUnicodeWriter *writer)
8230 {
8231     const char *starts = s;
8232     const char *e;
8233     Py_ssize_t startinpos, endinpos;
8234     PyObject *errorHandler = NULL, *exc = NULL;
8235     unsigned char ch;
8236     PyObject *key, *item = NULL;
8237 
8238     e = s + size;
8239 
8240     while (s < e) {
8241         ch = *s;
8242 
8243         /* Get mapping (char ordinal -> integer, Unicode char or None) */
8244         key = PyLong_FromLong((long)ch);
8245         if (key == NULL)
8246             goto onError;
8247 
8248         item = PyObject_GetItem(mapping, key);
8249         Py_DECREF(key);
8250         if (item == NULL) {
8251             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8252                 /* No mapping found means: mapping is undefined. */
8253                 PyErr_Clear();
8254                 goto Undefined;
8255             } else
8256                 goto onError;
8257         }
8258 
8259         /* Apply mapping */
8260         if (item == Py_None)
8261             goto Undefined;
8262         if (PyLong_Check(item)) {
8263             long value = PyLong_AS_LONG(item);
8264             if (value == 0xFFFE)
8265                 goto Undefined;
8266             if (value < 0 || value > MAX_UNICODE) {
8267                 PyErr_Format(PyExc_TypeError,
8268                              "character mapping must be in range(0x%x)",
8269                              (unsigned long)MAX_UNICODE + 1);
8270                 goto onError;
8271             }
8272 
8273             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8274                 goto onError;
8275         }
8276         else if (PyUnicode_Check(item)) {
8277             if (PyUnicode_READY(item) == -1)
8278                 goto onError;
8279             if (PyUnicode_GET_LENGTH(item) == 1) {
8280                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8281                 if (value == 0xFFFE)
8282                     goto Undefined;
8283                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8284                     goto onError;
8285             }
8286             else {
8287                 writer->overallocate = 1;
8288                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8289                     goto onError;
8290             }
8291         }
8292         else {
8293             /* wrong return value */
8294             PyErr_SetString(PyExc_TypeError,
8295                             "character mapping must return integer, None or str");
8296             goto onError;
8297         }
8298         Py_CLEAR(item);
8299         ++s;
8300         continue;
8301 
8302 Undefined:
8303         /* undefined mapping */
8304         Py_CLEAR(item);
8305         startinpos = s-starts;
8306         endinpos = startinpos+1;
8307         if (unicode_decode_call_errorhandler_writer(
8308                 errors, &errorHandler,
8309                 "charmap", "character maps to <undefined>",
8310                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8311                 writer)) {
8312             goto onError;
8313         }
8314     }
8315     Py_XDECREF(errorHandler);
8316     Py_XDECREF(exc);
8317     return 0;
8318 
8319 onError:
8320     Py_XDECREF(item);
8321     Py_XDECREF(errorHandler);
8322     Py_XDECREF(exc);
8323     return -1;
8324 }
8325 
8326 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8327 PyUnicode_DecodeCharmap(const char *s,
8328                         Py_ssize_t size,
8329                         PyObject *mapping,
8330                         const char *errors)
8331 {
8332     _PyUnicodeWriter writer;
8333 
8334     /* Default to Latin-1 */
8335     if (mapping == NULL)
8336         return PyUnicode_DecodeLatin1(s, size, errors);
8337 
8338     if (size == 0)
8339         _Py_RETURN_UNICODE_EMPTY();
8340     _PyUnicodeWriter_Init(&writer);
8341     writer.min_length = size;
8342     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8343         goto onError;
8344 
8345     if (PyUnicode_CheckExact(mapping)) {
8346         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8347             goto onError;
8348     }
8349     else {
8350         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8351             goto onError;
8352     }
8353     return _PyUnicodeWriter_Finish(&writer);
8354 
8355   onError:
8356     _PyUnicodeWriter_Dealloc(&writer);
8357     return NULL;
8358 }
8359 
8360 /* Charmap encoding: the lookup table */
8361 
8362 /*[clinic input]
8363 class EncodingMap "struct encoding_map *" "&EncodingMapType"
8364 [clinic start generated code]*/
8365 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8366 
8367 struct encoding_map {
8368     PyObject_HEAD
8369     unsigned char level1[32];
8370     int count2, count3;
8371     unsigned char level23[1];
8372 };
8373 
8374 /*[clinic input]
8375 EncodingMap.size
8376 
8377 Return the size (in bytes) of this object.
8378 [clinic start generated code]*/
8379 
8380 static PyObject *
EncodingMap_size_impl(struct encoding_map * self)8381 EncodingMap_size_impl(struct encoding_map *self)
8382 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8383 {
8384     return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8385                            128*self->count3);
8386 }
8387 
8388 static PyMethodDef encoding_map_methods[] = {
8389     ENCODINGMAP_SIZE_METHODDEF
8390     {NULL, NULL}
8391 };
8392 
8393 static PyTypeObject EncodingMapType = {
8394     PyVarObject_HEAD_INIT(NULL, 0)
8395     .tp_name = "EncodingMap",
8396     .tp_basicsize = sizeof(struct encoding_map),
8397     /* methods */
8398     .tp_flags = Py_TPFLAGS_DEFAULT,
8399     .tp_methods = encoding_map_methods,
8400 };
8401 
8402 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8403 PyUnicode_BuildEncodingMap(PyObject* string)
8404 {
8405     PyObject *result;
8406     struct encoding_map *mresult;
8407     int i;
8408     int need_dict = 0;
8409     unsigned char level1[32];
8410     unsigned char level2[512];
8411     unsigned char *mlevel1, *mlevel2, *mlevel3;
8412     int count2 = 0, count3 = 0;
8413     int kind;
8414     const void *data;
8415     Py_ssize_t length;
8416     Py_UCS4 ch;
8417 
8418     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8419         PyErr_BadArgument();
8420         return NULL;
8421     }
8422     kind = PyUnicode_KIND(string);
8423     data = PyUnicode_DATA(string);
8424     length = PyUnicode_GET_LENGTH(string);
8425     length = Py_MIN(length, 256);
8426     memset(level1, 0xFF, sizeof level1);
8427     memset(level2, 0xFF, sizeof level2);
8428 
8429     /* If there isn't a one-to-one mapping of NULL to \0,
8430        or if there are non-BMP characters, we need to use
8431        a mapping dictionary. */
8432     if (PyUnicode_READ(kind, data, 0) != 0)
8433         need_dict = 1;
8434     for (i = 1; i < length; i++) {
8435         int l1, l2;
8436         ch = PyUnicode_READ(kind, data, i);
8437         if (ch == 0 || ch > 0xFFFF) {
8438             need_dict = 1;
8439             break;
8440         }
8441         if (ch == 0xFFFE)
8442             /* unmapped character */
8443             continue;
8444         l1 = ch >> 11;
8445         l2 = ch >> 7;
8446         if (level1[l1] == 0xFF)
8447             level1[l1] = count2++;
8448         if (level2[l2] == 0xFF)
8449             level2[l2] = count3++;
8450     }
8451 
8452     if (count2 >= 0xFF || count3 >= 0xFF)
8453         need_dict = 1;
8454 
8455     if (need_dict) {
8456         PyObject *result = PyDict_New();
8457         PyObject *key, *value;
8458         if (!result)
8459             return NULL;
8460         for (i = 0; i < length; i++) {
8461             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8462             value = PyLong_FromLong(i);
8463             if (!key || !value)
8464                 goto failed1;
8465             if (PyDict_SetItem(result, key, value) == -1)
8466                 goto failed1;
8467             Py_DECREF(key);
8468             Py_DECREF(value);
8469         }
8470         return result;
8471       failed1:
8472         Py_XDECREF(key);
8473         Py_XDECREF(value);
8474         Py_DECREF(result);
8475         return NULL;
8476     }
8477 
8478     /* Create a three-level trie */
8479     result = PyObject_Malloc(sizeof(struct encoding_map) +
8480                              16*count2 + 128*count3 - 1);
8481     if (!result) {
8482         return PyErr_NoMemory();
8483     }
8484 
8485     _PyObject_Init(result, &EncodingMapType);
8486     mresult = (struct encoding_map*)result;
8487     mresult->count2 = count2;
8488     mresult->count3 = count3;
8489     mlevel1 = mresult->level1;
8490     mlevel2 = mresult->level23;
8491     mlevel3 = mresult->level23 + 16*count2;
8492     memcpy(mlevel1, level1, 32);
8493     memset(mlevel2, 0xFF, 16*count2);
8494     memset(mlevel3, 0, 128*count3);
8495     count3 = 0;
8496     for (i = 1; i < length; i++) {
8497         int o1, o2, o3, i2, i3;
8498         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8499         if (ch == 0xFFFE)
8500             /* unmapped character */
8501             continue;
8502         o1 = ch>>11;
8503         o2 = (ch>>7) & 0xF;
8504         i2 = 16*mlevel1[o1] + o2;
8505         if (mlevel2[i2] == 0xFF)
8506             mlevel2[i2] = count3++;
8507         o3 = ch & 0x7F;
8508         i3 = 128*mlevel2[i2] + o3;
8509         mlevel3[i3] = i;
8510     }
8511     return result;
8512 }
8513 
8514 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8515 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8516 {
8517     struct encoding_map *map = (struct encoding_map*)mapping;
8518     int l1 = c>>11;
8519     int l2 = (c>>7) & 0xF;
8520     int l3 = c & 0x7F;
8521     int i;
8522 
8523     if (c > 0xFFFF)
8524         return -1;
8525     if (c == 0)
8526         return 0;
8527     /* level 1*/
8528     i = map->level1[l1];
8529     if (i == 0xFF) {
8530         return -1;
8531     }
8532     /* level 2*/
8533     i = map->level23[16*i+l2];
8534     if (i == 0xFF) {
8535         return -1;
8536     }
8537     /* level 3 */
8538     i = map->level23[16*map->count2 + 128*i + l3];
8539     if (i == 0) {
8540         return -1;
8541     }
8542     return i;
8543 }
8544 
8545 /* Lookup the character ch in the mapping. If the character
8546    can't be found, Py_None is returned (or NULL, if another
8547    error occurred). */
8548 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8549 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8550 {
8551     PyObject *w = PyLong_FromLong((long)c);
8552     PyObject *x;
8553 
8554     if (w == NULL)
8555         return NULL;
8556     x = PyObject_GetItem(mapping, w);
8557     Py_DECREF(w);
8558     if (x == NULL) {
8559         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8560             /* No mapping found means: mapping is undefined. */
8561             PyErr_Clear();
8562             Py_RETURN_NONE;
8563         } else
8564             return NULL;
8565     }
8566     else if (x == Py_None)
8567         return x;
8568     else if (PyLong_Check(x)) {
8569         long value = PyLong_AS_LONG(x);
8570         if (value < 0 || value > 255) {
8571             PyErr_SetString(PyExc_TypeError,
8572                             "character mapping must be in range(256)");
8573             Py_DECREF(x);
8574             return NULL;
8575         }
8576         return x;
8577     }
8578     else if (PyBytes_Check(x))
8579         return x;
8580     else {
8581         /* wrong return value */
8582         PyErr_Format(PyExc_TypeError,
8583                      "character mapping must return integer, bytes or None, not %.400s",
8584                      Py_TYPE(x)->tp_name);
8585         Py_DECREF(x);
8586         return NULL;
8587     }
8588 }
8589 
8590 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8591 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8592 {
8593     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8594     /* exponentially overallocate to minimize reallocations */
8595     if (requiredsize < 2*outsize)
8596         requiredsize = 2*outsize;
8597     if (_PyBytes_Resize(outobj, requiredsize))
8598         return -1;
8599     return 0;
8600 }
8601 
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
8605 /* lookup the character, put the result in the output string and adjust
8606    various state variables. Resize the output bytes object if not enough
8607    space is available. Return a new reference to the object that
8608    was put in the output buffer, or Py_None, if the mapping was undefined
8609    (in which case no character was written) or NULL, if a
8610    reallocation error occurred. The caller must decref the result */
8611 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8612 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8613                      PyObject **outobj, Py_ssize_t *outpos)
8614 {
8615     PyObject *rep;
8616     char *outstart;
8617     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8618 
8619     if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8620         int res = encoding_map_lookup(c, mapping);
8621         Py_ssize_t requiredsize = *outpos+1;
8622         if (res == -1)
8623             return enc_FAILED;
8624         if (outsize<requiredsize)
8625             if (charmapencode_resize(outobj, outpos, requiredsize))
8626                 return enc_EXCEPTION;
8627         outstart = PyBytes_AS_STRING(*outobj);
8628         outstart[(*outpos)++] = (char)res;
8629         return enc_SUCCESS;
8630     }
8631 
8632     rep = charmapencode_lookup(c, mapping);
8633     if (rep==NULL)
8634         return enc_EXCEPTION;
8635     else if (rep==Py_None) {
8636         Py_DECREF(rep);
8637         return enc_FAILED;
8638     } else {
8639         if (PyLong_Check(rep)) {
8640             Py_ssize_t requiredsize = *outpos+1;
8641             if (outsize<requiredsize)
8642                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8643                     Py_DECREF(rep);
8644                     return enc_EXCEPTION;
8645                 }
8646             outstart = PyBytes_AS_STRING(*outobj);
8647             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8648         }
8649         else {
8650             const char *repchars = PyBytes_AS_STRING(rep);
8651             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8652             Py_ssize_t requiredsize = *outpos+repsize;
8653             if (outsize<requiredsize)
8654                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8655                     Py_DECREF(rep);
8656                     return enc_EXCEPTION;
8657                 }
8658             outstart = PyBytes_AS_STRING(*outobj);
8659             memcpy(outstart + *outpos, repchars, repsize);
8660             *outpos += repsize;
8661         }
8662     }
8663     Py_DECREF(rep);
8664     return enc_SUCCESS;
8665 }
8666 
8667 /* handle an error in PyUnicode_EncodeCharmap
8668    Return 0 on success, -1 on error */
8669 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8670 charmap_encoding_error(
8671     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8672     PyObject **exceptionObject,
8673     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8674     PyObject **res, Py_ssize_t *respos)
8675 {
8676     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8677     Py_ssize_t size, repsize;
8678     Py_ssize_t newpos;
8679     enum PyUnicode_Kind kind;
8680     const void *data;
8681     Py_ssize_t index;
8682     /* startpos for collecting unencodable chars */
8683     Py_ssize_t collstartpos = *inpos;
8684     Py_ssize_t collendpos = *inpos+1;
8685     Py_ssize_t collpos;
8686     const char *encoding = "charmap";
8687     const char *reason = "character maps to <undefined>";
8688     charmapencode_result x;
8689     Py_UCS4 ch;
8690     int val;
8691 
8692     if (PyUnicode_READY(unicode) == -1)
8693         return -1;
8694     size = PyUnicode_GET_LENGTH(unicode);
8695     /* find all unencodable characters */
8696     while (collendpos < size) {
8697         PyObject *rep;
8698         if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8699             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8700             val = encoding_map_lookup(ch, mapping);
8701             if (val != -1)
8702                 break;
8703             ++collendpos;
8704             continue;
8705         }
8706 
8707         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8708         rep = charmapencode_lookup(ch, mapping);
8709         if (rep==NULL)
8710             return -1;
8711         else if (rep!=Py_None) {
8712             Py_DECREF(rep);
8713             break;
8714         }
8715         Py_DECREF(rep);
8716         ++collendpos;
8717     }
8718     /* cache callback name lookup
8719      * (if not done yet, i.e. it's the first error) */
8720     if (*error_handler == _Py_ERROR_UNKNOWN)
8721         *error_handler = _Py_GetErrorHandler(errors);
8722 
8723     switch (*error_handler) {
8724     case _Py_ERROR_STRICT:
8725         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8726         return -1;
8727 
8728     case _Py_ERROR_REPLACE:
8729         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8730             x = charmapencode_output('?', mapping, res, respos);
8731             if (x==enc_EXCEPTION) {
8732                 return -1;
8733             }
8734             else if (x==enc_FAILED) {
8735                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8736                 return -1;
8737             }
8738         }
8739         /* fall through */
8740     case _Py_ERROR_IGNORE:
8741         *inpos = collendpos;
8742         break;
8743 
8744     case _Py_ERROR_XMLCHARREFREPLACE:
8745         /* generate replacement (temporarily (mis)uses p) */
8746         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8747             char buffer[2+29+1+1];
8748             char *cp;
8749             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8750             for (cp = buffer; *cp; ++cp) {
8751                 x = charmapencode_output(*cp, mapping, res, respos);
8752                 if (x==enc_EXCEPTION)
8753                     return -1;
8754                 else if (x==enc_FAILED) {
8755                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8756                     return -1;
8757                 }
8758             }
8759         }
8760         *inpos = collendpos;
8761         break;
8762 
8763     default:
8764         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8765                                                       encoding, reason, unicode, exceptionObject,
8766                                                       collstartpos, collendpos, &newpos);
8767         if (repunicode == NULL)
8768             return -1;
8769         if (PyBytes_Check(repunicode)) {
8770             /* Directly copy bytes result to output. */
8771             Py_ssize_t outsize = PyBytes_Size(*res);
8772             Py_ssize_t requiredsize;
8773             repsize = PyBytes_Size(repunicode);
8774             requiredsize = *respos + repsize;
8775             if (requiredsize > outsize)
8776                 /* Make room for all additional bytes. */
8777                 if (charmapencode_resize(res, respos, requiredsize)) {
8778                     Py_DECREF(repunicode);
8779                     return -1;
8780                 }
8781             memcpy(PyBytes_AsString(*res) + *respos,
8782                    PyBytes_AsString(repunicode),  repsize);
8783             *respos += repsize;
8784             *inpos = newpos;
8785             Py_DECREF(repunicode);
8786             break;
8787         }
8788         /* generate replacement  */
8789         if (PyUnicode_READY(repunicode) == -1) {
8790             Py_DECREF(repunicode);
8791             return -1;
8792         }
8793         repsize = PyUnicode_GET_LENGTH(repunicode);
8794         data = PyUnicode_DATA(repunicode);
8795         kind = PyUnicode_KIND(repunicode);
8796         for (index = 0; index < repsize; index++) {
8797             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8798             x = charmapencode_output(repch, mapping, res, respos);
8799             if (x==enc_EXCEPTION) {
8800                 Py_DECREF(repunicode);
8801                 return -1;
8802             }
8803             else if (x==enc_FAILED) {
8804                 Py_DECREF(repunicode);
8805                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8806                 return -1;
8807             }
8808         }
8809         *inpos = newpos;
8810         Py_DECREF(repunicode);
8811     }
8812     return 0;
8813 }
8814 
8815 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8816 _PyUnicode_EncodeCharmap(PyObject *unicode,
8817                          PyObject *mapping,
8818                          const char *errors)
8819 {
8820     /* output object */
8821     PyObject *res = NULL;
8822     /* current input position */
8823     Py_ssize_t inpos = 0;
8824     Py_ssize_t size;
8825     /* current output position */
8826     Py_ssize_t respos = 0;
8827     PyObject *error_handler_obj = NULL;
8828     PyObject *exc = NULL;
8829     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8830     const void *data;
8831     int kind;
8832 
8833     if (PyUnicode_READY(unicode) == -1)
8834         return NULL;
8835     size = PyUnicode_GET_LENGTH(unicode);
8836     data = PyUnicode_DATA(unicode);
8837     kind = PyUnicode_KIND(unicode);
8838 
8839     /* Default to Latin-1 */
8840     if (mapping == NULL)
8841         return unicode_encode_ucs1(unicode, errors, 256);
8842 
8843     /* allocate enough for a simple encoding without
8844        replacements, if we need more, we'll resize */
8845     res = PyBytes_FromStringAndSize(NULL, size);
8846     if (res == NULL)
8847         goto onError;
8848     if (size == 0)
8849         return res;
8850 
8851     while (inpos<size) {
8852         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8853         /* try to encode it */
8854         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8855         if (x==enc_EXCEPTION) /* error */
8856             goto onError;
8857         if (x==enc_FAILED) { /* unencodable character */
8858             if (charmap_encoding_error(unicode, &inpos, mapping,
8859                                        &exc,
8860                                        &error_handler, &error_handler_obj, errors,
8861                                        &res, &respos)) {
8862                 goto onError;
8863             }
8864         }
8865         else
8866             /* done with this character => adjust input position */
8867             ++inpos;
8868     }
8869 
8870     /* Resize if we allocated to much */
8871     if (respos<PyBytes_GET_SIZE(res))
8872         if (_PyBytes_Resize(&res, respos) < 0)
8873             goto onError;
8874 
8875     Py_XDECREF(exc);
8876     Py_XDECREF(error_handler_obj);
8877     return res;
8878 
8879   onError:
8880     Py_XDECREF(res);
8881     Py_XDECREF(exc);
8882     Py_XDECREF(error_handler_obj);
8883     return NULL;
8884 }
8885 
8886 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8887 PyUnicode_AsCharmapString(PyObject *unicode,
8888                           PyObject *mapping)
8889 {
8890     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8891         PyErr_BadArgument();
8892         return NULL;
8893     }
8894     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8895 }
8896 
8897 /* create or adjust a UnicodeTranslateError */
8898 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8899 make_translate_exception(PyObject **exceptionObject,
8900                          PyObject *unicode,
8901                          Py_ssize_t startpos, Py_ssize_t endpos,
8902                          const char *reason)
8903 {
8904     if (*exceptionObject == NULL) {
8905         *exceptionObject = _PyUnicodeTranslateError_Create(
8906             unicode, startpos, endpos, reason);
8907     }
8908     else {
8909         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8910             goto onError;
8911         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8912             goto onError;
8913         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8914             goto onError;
8915         return;
8916       onError:
8917         Py_CLEAR(*exceptionObject);
8918     }
8919 }
8920 
8921 /* error handling callback helper:
8922    build arguments, call the callback and check the arguments,
8923    put the result into newpos and return the replacement string, which
8924    has to be freed by the caller */
8925 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8926 unicode_translate_call_errorhandler(const char *errors,
8927                                     PyObject **errorHandler,
8928                                     const char *reason,
8929                                     PyObject *unicode, PyObject **exceptionObject,
8930                                     Py_ssize_t startpos, Py_ssize_t endpos,
8931                                     Py_ssize_t *newpos)
8932 {
8933     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8934 
8935     Py_ssize_t i_newpos;
8936     PyObject *restuple;
8937     PyObject *resunicode;
8938 
8939     if (*errorHandler == NULL) {
8940         *errorHandler = PyCodec_LookupError(errors);
8941         if (*errorHandler == NULL)
8942             return NULL;
8943     }
8944 
8945     make_translate_exception(exceptionObject,
8946                              unicode, startpos, endpos, reason);
8947     if (*exceptionObject == NULL)
8948         return NULL;
8949 
8950     restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8951     if (restuple == NULL)
8952         return NULL;
8953     if (!PyTuple_Check(restuple)) {
8954         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8955         Py_DECREF(restuple);
8956         return NULL;
8957     }
8958     if (!PyArg_ParseTuple(restuple, argparse,
8959                           &resunicode, &i_newpos)) {
8960         Py_DECREF(restuple);
8961         return NULL;
8962     }
8963     if (i_newpos<0)
8964         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8965     else
8966         *newpos = i_newpos;
8967     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8968         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8969         Py_DECREF(restuple);
8970         return NULL;
8971     }
8972     Py_INCREF(resunicode);
8973     Py_DECREF(restuple);
8974     return resunicode;
8975 }
8976 
8977 /* Lookup the character ch in the mapping and put the result in result,
8978    which must be decrefed by the caller.
8979    Return 0 on success, -1 on error */
8980 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8981 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8982 {
8983     PyObject *w = PyLong_FromLong((long)c);
8984     PyObject *x;
8985 
8986     if (w == NULL)
8987         return -1;
8988     x = PyObject_GetItem(mapping, w);
8989     Py_DECREF(w);
8990     if (x == NULL) {
8991         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8992             /* No mapping found means: use 1:1 mapping. */
8993             PyErr_Clear();
8994             *result = NULL;
8995             return 0;
8996         } else
8997             return -1;
8998     }
8999     else if (x == Py_None) {
9000         *result = x;
9001         return 0;
9002     }
9003     else if (PyLong_Check(x)) {
9004         long value = PyLong_AS_LONG(x);
9005         if (value < 0 || value > MAX_UNICODE) {
9006             PyErr_Format(PyExc_ValueError,
9007                          "character mapping must be in range(0x%x)",
9008                          MAX_UNICODE+1);
9009             Py_DECREF(x);
9010             return -1;
9011         }
9012         *result = x;
9013         return 0;
9014     }
9015     else if (PyUnicode_Check(x)) {
9016         *result = x;
9017         return 0;
9018     }
9019     else {
9020         /* wrong return value */
9021         PyErr_SetString(PyExc_TypeError,
9022                         "character mapping must return integer, None or str");
9023         Py_DECREF(x);
9024         return -1;
9025     }
9026 }
9027 
9028 /* lookup the character, write the result into the writer.
9029    Return 1 if the result was written into the writer, return 0 if the mapping
9030    was undefined, raise an exception return -1 on error. */
9031 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9032 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9033                         _PyUnicodeWriter *writer)
9034 {
9035     PyObject *item;
9036 
9037     if (charmaptranslate_lookup(ch, mapping, &item))
9038         return -1;
9039 
9040     if (item == NULL) {
9041         /* not found => default to 1:1 mapping */
9042         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9043             return -1;
9044         }
9045         return 1;
9046     }
9047 
9048     if (item == Py_None) {
9049         Py_DECREF(item);
9050         return 0;
9051     }
9052 
9053     if (PyLong_Check(item)) {
9054         long ch = (Py_UCS4)PyLong_AS_LONG(item);
9055         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9056            used it */
9057         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9058             Py_DECREF(item);
9059             return -1;
9060         }
9061         Py_DECREF(item);
9062         return 1;
9063     }
9064 
9065     if (!PyUnicode_Check(item)) {
9066         Py_DECREF(item);
9067         return -1;
9068     }
9069 
9070     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9071         Py_DECREF(item);
9072         return -1;
9073     }
9074 
9075     Py_DECREF(item);
9076     return 1;
9077 }
9078 
9079 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9080 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9081                               Py_UCS1 *translate)
9082 {
9083     PyObject *item = NULL;
9084     int ret = 0;
9085 
9086     if (charmaptranslate_lookup(ch, mapping, &item)) {
9087         return -1;
9088     }
9089 
9090     if (item == Py_None) {
9091         /* deletion */
9092         translate[ch] = 0xfe;
9093     }
9094     else if (item == NULL) {
9095         /* not found => default to 1:1 mapping */
9096         translate[ch] = ch;
9097         return 1;
9098     }
9099     else if (PyLong_Check(item)) {
9100         long replace = PyLong_AS_LONG(item);
9101         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9102            used it */
9103         if (127 < replace) {
9104             /* invalid character or character outside ASCII:
9105                skip the fast translate */
9106             goto exit;
9107         }
9108         translate[ch] = (Py_UCS1)replace;
9109     }
9110     else if (PyUnicode_Check(item)) {
9111         Py_UCS4 replace;
9112 
9113         if (PyUnicode_READY(item) == -1) {
9114             Py_DECREF(item);
9115             return -1;
9116         }
9117         if (PyUnicode_GET_LENGTH(item) != 1)
9118             goto exit;
9119 
9120         replace = PyUnicode_READ_CHAR(item, 0);
9121         if (replace > 127)
9122             goto exit;
9123         translate[ch] = (Py_UCS1)replace;
9124     }
9125     else {
9126         /* not None, NULL, long or unicode */
9127         goto exit;
9128     }
9129     ret = 1;
9130 
9131   exit:
9132     Py_DECREF(item);
9133     return ret;
9134 }
9135 
9136 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9137    was translated into writer, return 0 if the input string was partially
9138    translated into writer, raise an exception and return -1 on error. */
9139 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9140 unicode_fast_translate(PyObject *input, PyObject *mapping,
9141                        _PyUnicodeWriter *writer, int ignore,
9142                        Py_ssize_t *input_pos)
9143 {
9144     Py_UCS1 ascii_table[128], ch, ch2;
9145     Py_ssize_t len;
9146     const Py_UCS1 *in, *end;
9147     Py_UCS1 *out;
9148     int res = 0;
9149 
9150     len = PyUnicode_GET_LENGTH(input);
9151 
9152     memset(ascii_table, 0xff, 128);
9153 
9154     in = PyUnicode_1BYTE_DATA(input);
9155     end = in + len;
9156 
9157     assert(PyUnicode_IS_ASCII(writer->buffer));
9158     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9159     out = PyUnicode_1BYTE_DATA(writer->buffer);
9160 
9161     for (; in < end; in++) {
9162         ch = *in;
9163         ch2 = ascii_table[ch];
9164         if (ch2 == 0xff) {
9165             int translate = unicode_fast_translate_lookup(mapping, ch,
9166                                                           ascii_table);
9167             if (translate < 0)
9168                 return -1;
9169             if (translate == 0)
9170                 goto exit;
9171             ch2 = ascii_table[ch];
9172         }
9173         if (ch2 == 0xfe) {
9174             if (ignore)
9175                 continue;
9176             goto exit;
9177         }
9178         assert(ch2 < 128);
9179         *out = ch2;
9180         out++;
9181     }
9182     res = 1;
9183 
9184 exit:
9185     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9186     *input_pos = in - PyUnicode_1BYTE_DATA(input);
9187     return res;
9188 }
9189 
9190 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9191 _PyUnicode_TranslateCharmap(PyObject *input,
9192                             PyObject *mapping,
9193                             const char *errors)
9194 {
9195     /* input object */
9196     const void *data;
9197     Py_ssize_t size, i;
9198     int kind;
9199     /* output buffer */
9200     _PyUnicodeWriter writer;
9201     /* error handler */
9202     const char *reason = "character maps to <undefined>";
9203     PyObject *errorHandler = NULL;
9204     PyObject *exc = NULL;
9205     int ignore;
9206     int res;
9207 
9208     if (mapping == NULL) {
9209         PyErr_BadArgument();
9210         return NULL;
9211     }
9212 
9213     if (PyUnicode_READY(input) == -1)
9214         return NULL;
9215     data = PyUnicode_DATA(input);
9216     kind = PyUnicode_KIND(input);
9217     size = PyUnicode_GET_LENGTH(input);
9218 
9219     if (size == 0)
9220         return PyUnicode_FromObject(input);
9221 
9222     /* allocate enough for a simple 1:1 translation without
9223        replacements, if we need more, we'll resize */
9224     _PyUnicodeWriter_Init(&writer);
9225     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9226         goto onError;
9227 
9228     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9229 
9230     if (PyUnicode_READY(input) == -1)
9231         return NULL;
9232     if (PyUnicode_IS_ASCII(input)) {
9233         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9234         if (res < 0) {
9235             _PyUnicodeWriter_Dealloc(&writer);
9236             return NULL;
9237         }
9238         if (res == 1)
9239             return _PyUnicodeWriter_Finish(&writer);
9240     }
9241     else {
9242         i = 0;
9243     }
9244 
9245     while (i<size) {
9246         /* try to encode it */
9247         int translate;
9248         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9249         Py_ssize_t newpos;
9250         /* startpos for collecting untranslatable chars */
9251         Py_ssize_t collstart;
9252         Py_ssize_t collend;
9253         Py_UCS4 ch;
9254 
9255         ch = PyUnicode_READ(kind, data, i);
9256         translate = charmaptranslate_output(ch, mapping, &writer);
9257         if (translate < 0)
9258             goto onError;
9259 
9260         if (translate != 0) {
9261             /* it worked => adjust input pointer */
9262             ++i;
9263             continue;
9264         }
9265 
9266         /* untranslatable character */
9267         collstart = i;
9268         collend = i+1;
9269 
9270         /* find all untranslatable characters */
9271         while (collend < size) {
9272             PyObject *x;
9273             ch = PyUnicode_READ(kind, data, collend);
9274             if (charmaptranslate_lookup(ch, mapping, &x))
9275                 goto onError;
9276             Py_XDECREF(x);
9277             if (x != Py_None)
9278                 break;
9279             ++collend;
9280         }
9281 
9282         if (ignore) {
9283             i = collend;
9284         }
9285         else {
9286             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9287                                                              reason, input, &exc,
9288                                                              collstart, collend, &newpos);
9289             if (repunicode == NULL)
9290                 goto onError;
9291             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9292                 Py_DECREF(repunicode);
9293                 goto onError;
9294             }
9295             Py_DECREF(repunicode);
9296             i = newpos;
9297         }
9298     }
9299     Py_XDECREF(exc);
9300     Py_XDECREF(errorHandler);
9301     return _PyUnicodeWriter_Finish(&writer);
9302 
9303   onError:
9304     _PyUnicodeWriter_Dealloc(&writer);
9305     Py_XDECREF(exc);
9306     Py_XDECREF(errorHandler);
9307     return NULL;
9308 }
9309 
9310 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9311 PyUnicode_Translate(PyObject *str,
9312                     PyObject *mapping,
9313                     const char *errors)
9314 {
9315     if (ensure_unicode(str) < 0)
9316         return NULL;
9317     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9318 }
9319 
9320 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9321 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9322 {
9323     if (!PyUnicode_Check(unicode)) {
9324         PyErr_BadInternalCall();
9325         return NULL;
9326     }
9327     if (PyUnicode_READY(unicode) == -1)
9328         return NULL;
9329     if (PyUnicode_IS_ASCII(unicode)) {
9330         /* If the string is already ASCII, just return the same string */
9331         Py_INCREF(unicode);
9332         return unicode;
9333     }
9334 
9335     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9336     PyObject *result = PyUnicode_New(len, 127);
9337     if (result == NULL) {
9338         return NULL;
9339     }
9340 
9341     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9342     int kind = PyUnicode_KIND(unicode);
9343     const void *data = PyUnicode_DATA(unicode);
9344     Py_ssize_t i;
9345     for (i = 0; i < len; ++i) {
9346         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9347         if (ch < 127) {
9348             out[i] = ch;
9349         }
9350         else if (Py_UNICODE_ISSPACE(ch)) {
9351             out[i] = ' ';
9352         }
9353         else {
9354             int decimal = Py_UNICODE_TODECIMAL(ch);
9355             if (decimal < 0) {
9356                 out[i] = '?';
9357                 out[i+1] = '\0';
9358                 _PyUnicode_LENGTH(result) = i + 1;
9359                 break;
9360             }
9361             out[i] = '0' + decimal;
9362         }
9363     }
9364 
9365     assert(_PyUnicode_CheckConsistency(result, 1));
9366     return result;
9367 }
9368 
9369 /* --- Helpers ------------------------------------------------------------ */
9370 
/* Helper macro to fix up start/end slice values against a length `len`:
     - end greater than len is clamped to len;
     - a negative end is taken relative to len, then clamped to 0;
     - a negative start is taken relative to len, then clamped to 0.

   `start` and `end` must be modifiable lvalues; they are updated in place.
   Every parameter is parenthesized so that expression arguments (for
   example a conditional expression passed as `len`) expand correctly.
   Arguments are evaluated more than once, so they must not have side
   effects.

   The macro deliberately still expands to a plain sequence of if-statements
   rather than a single do/while statement, so existing call sites keep
   compiling unchanged; do not use it as the unbraced body of another
   if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
9385 
9386 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9387 any_find_slice(PyObject* s1, PyObject* s2,
9388                Py_ssize_t start,
9389                Py_ssize_t end,
9390                int direction)
9391 {
9392     int kind1, kind2;
9393     const void *buf1, *buf2;
9394     Py_ssize_t len1, len2, result;
9395 
9396     kind1 = PyUnicode_KIND(s1);
9397     kind2 = PyUnicode_KIND(s2);
9398     if (kind1 < kind2)
9399         return -1;
9400 
9401     len1 = PyUnicode_GET_LENGTH(s1);
9402     len2 = PyUnicode_GET_LENGTH(s2);
9403     ADJUST_INDICES(start, end, len1);
9404     if (end - start < len2)
9405         return -1;
9406 
9407     buf1 = PyUnicode_DATA(s1);
9408     buf2 = PyUnicode_DATA(s2);
9409     if (len2 == 1) {
9410         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9411         result = findchar((const char *)buf1 + kind1*start,
9412                           kind1, end - start, ch, direction);
9413         if (result == -1)
9414             return -1;
9415         else
9416             return start + result;
9417     }
9418 
9419     if (kind2 != kind1) {
9420         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9421         if (!buf2)
9422             return -2;
9423     }
9424 
9425     if (direction > 0) {
9426         switch (kind1) {
9427         case PyUnicode_1BYTE_KIND:
9428             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9429                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9430             else
9431                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9432             break;
9433         case PyUnicode_2BYTE_KIND:
9434             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9435             break;
9436         case PyUnicode_4BYTE_KIND:
9437             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9438             break;
9439         default:
9440             Py_UNREACHABLE();
9441         }
9442     }
9443     else {
9444         switch (kind1) {
9445         case PyUnicode_1BYTE_KIND:
9446             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9447                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9448             else
9449                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9450             break;
9451         case PyUnicode_2BYTE_KIND:
9452             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9453             break;
9454         case PyUnicode_4BYTE_KIND:
9455             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9456             break;
9457         default:
9458             Py_UNREACHABLE();
9459         }
9460     }
9461 
9462     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9463     if (kind2 != kind1)
9464         PyMem_Free((void *)buf2);
9465 
9466     return result;
9467 }
9468 
9469 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9470 #include "stringlib/localeutil.h"
9471 
9472 /**
9473  * InsertThousandsGrouping:
9474  * @writer: Unicode writer.
9475  * @n_buffer: Number of characters in @buffer.
9476  * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9477  * @d_pos: Start of digits string.
9478  * @n_digits: The number of digits in the string, in which we want
9479  *            to put the grouping chars.
9480  * @min_width: The minimum width of the digits in the output string.
9481  *             Output will be zero-padded on the left to fill.
9482  * @grouping: see definition in localeconv().
9483  * @thousands_sep: see definition in localeconv().
9484  *
9485  * There are 2 modes: counting and filling. If @writer is NULL,
9486  *  we are in counting mode, else filling mode.
9487  * If counting, the required buffer size is returned.
9488  * If filling, we know the buffer will be large enough, so we don't
9489  *  need to pass in the buffer size.
9490  * Inserts thousand grouping characters (as defined by grouping and
9491  *  thousands_sep) into @writer.
9492  *
9493  * Return value: -1 on error, number of characters otherwise.
9494  **/
9495 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9496 _PyUnicode_InsertThousandsGrouping(
9497     _PyUnicodeWriter *writer,
9498     Py_ssize_t n_buffer,
9499     PyObject *digits,
9500     Py_ssize_t d_pos,
9501     Py_ssize_t n_digits,
9502     Py_ssize_t min_width,
9503     const char *grouping,
9504     PyObject *thousands_sep,
9505     Py_UCS4 *maxchar)
9506 {
9507     min_width = Py_MAX(0, min_width);
9508     if (writer) {
9509         assert(digits != NULL);
9510         assert(maxchar == NULL);
9511     }
9512     else {
9513         assert(digits == NULL);
9514         assert(maxchar != NULL);
9515     }
9516     assert(0 <= d_pos);
9517     assert(0 <= n_digits);
9518     assert(grouping != NULL);
9519 
9520     if (digits != NULL) {
9521         if (PyUnicode_READY(digits) == -1) {
9522             return -1;
9523         }
9524     }
9525     if (PyUnicode_READY(thousands_sep) == -1) {
9526         return -1;
9527     }
9528 
9529     Py_ssize_t count = 0;
9530     Py_ssize_t n_zeros;
9531     int loop_broken = 0;
9532     int use_separator = 0; /* First time through, don't append the
9533                               separator. They only go between
9534                               groups. */
9535     Py_ssize_t buffer_pos;
9536     Py_ssize_t digits_pos;
9537     Py_ssize_t len;
9538     Py_ssize_t n_chars;
9539     Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9540                                         be looked at */
9541     /* A generator that returns all of the grouping widths, until it
9542        returns 0. */
9543     GroupGenerator groupgen;
9544     GroupGenerator_init(&groupgen, grouping);
9545     const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9546 
9547     /* if digits are not grouped, thousands separator
9548        should be an empty string */
9549     assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9550 
9551     digits_pos = d_pos + n_digits;
9552     if (writer) {
9553         buffer_pos = writer->pos + n_buffer;
9554         assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9555         assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9556     }
9557     else {
9558         buffer_pos = n_buffer;
9559     }
9560 
9561     if (!writer) {
9562         *maxchar = 127;
9563     }
9564 
9565     while ((len = GroupGenerator_next(&groupgen)) > 0) {
9566         len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9567         n_zeros = Py_MAX(0, len - remaining);
9568         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9569 
9570         /* Use n_zero zero's and n_chars chars */
9571 
9572         /* Count only, don't do anything. */
9573         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9574 
9575         /* Copy into the writer. */
9576         InsertThousandsGrouping_fill(writer, &buffer_pos,
9577                                      digits, &digits_pos,
9578                                      n_chars, n_zeros,
9579                                      use_separator ? thousands_sep : NULL,
9580                                      thousands_sep_len, maxchar);
9581 
9582         /* Use a separator next time. */
9583         use_separator = 1;
9584 
9585         remaining -= n_chars;
9586         min_width -= len;
9587 
9588         if (remaining <= 0 && min_width <= 0) {
9589             loop_broken = 1;
9590             break;
9591         }
9592         min_width -= thousands_sep_len;
9593     }
9594     if (!loop_broken) {
9595         /* We left the loop without using a break statement. */
9596 
9597         len = Py_MAX(Py_MAX(remaining, min_width), 1);
9598         n_zeros = Py_MAX(0, len - remaining);
9599         n_chars = Py_MAX(0, Py_MIN(remaining, len));
9600 
9601         /* Use n_zero zero's and n_chars chars */
9602         count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9603 
9604         /* Copy into the writer. */
9605         InsertThousandsGrouping_fill(writer, &buffer_pos,
9606                                      digits, &digits_pos,
9607                                      n_chars, n_zeros,
9608                                      use_separator ? thousands_sep : NULL,
9609                                      thousands_sep_len, maxchar);
9610     }
9611     return count;
9612 }
9613 
9614 
9615 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9616 PyUnicode_Count(PyObject *str,
9617                 PyObject *substr,
9618                 Py_ssize_t start,
9619                 Py_ssize_t end)
9620 {
9621     Py_ssize_t result;
9622     int kind1, kind2;
9623     const void *buf1 = NULL, *buf2 = NULL;
9624     Py_ssize_t len1, len2;
9625 
9626     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9627         return -1;
9628 
9629     kind1 = PyUnicode_KIND(str);
9630     kind2 = PyUnicode_KIND(substr);
9631     if (kind1 < kind2)
9632         return 0;
9633 
9634     len1 = PyUnicode_GET_LENGTH(str);
9635     len2 = PyUnicode_GET_LENGTH(substr);
9636     ADJUST_INDICES(start, end, len1);
9637     if (end - start < len2)
9638         return 0;
9639 
9640     buf1 = PyUnicode_DATA(str);
9641     buf2 = PyUnicode_DATA(substr);
9642     if (kind2 != kind1) {
9643         buf2 = unicode_askind(kind2, buf2, len2, kind1);
9644         if (!buf2)
9645             goto onError;
9646     }
9647 
9648     switch (kind1) {
9649     case PyUnicode_1BYTE_KIND:
9650         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9651             result = asciilib_count(
9652                 ((const Py_UCS1*)buf1) + start, end - start,
9653                 buf2, len2, PY_SSIZE_T_MAX
9654                 );
9655         else
9656             result = ucs1lib_count(
9657                 ((const Py_UCS1*)buf1) + start, end - start,
9658                 buf2, len2, PY_SSIZE_T_MAX
9659                 );
9660         break;
9661     case PyUnicode_2BYTE_KIND:
9662         result = ucs2lib_count(
9663             ((const Py_UCS2*)buf1) + start, end - start,
9664             buf2, len2, PY_SSIZE_T_MAX
9665             );
9666         break;
9667     case PyUnicode_4BYTE_KIND:
9668         result = ucs4lib_count(
9669             ((const Py_UCS4*)buf1) + start, end - start,
9670             buf2, len2, PY_SSIZE_T_MAX
9671             );
9672         break;
9673     default:
9674         Py_UNREACHABLE();
9675     }
9676 
9677     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9678     if (kind2 != kind1)
9679         PyMem_Free((void *)buf2);
9680 
9681     return result;
9682   onError:
9683     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9684     if (kind2 != kind1)
9685         PyMem_Free((void *)buf2);
9686     return -1;
9687 }
9688 
9689 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9690 PyUnicode_Find(PyObject *str,
9691                PyObject *substr,
9692                Py_ssize_t start,
9693                Py_ssize_t end,
9694                int direction)
9695 {
9696     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9697         return -2;
9698 
9699     return any_find_slice(str, substr, start, end, direction);
9700 }
9701 
9702 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9703 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9704                    Py_ssize_t start, Py_ssize_t end,
9705                    int direction)
9706 {
9707     int kind;
9708     Py_ssize_t len, result;
9709     if (PyUnicode_READY(str) == -1)
9710         return -2;
9711     len = PyUnicode_GET_LENGTH(str);
9712     ADJUST_INDICES(start, end, len);
9713     if (end - start < 1)
9714         return -1;
9715     kind = PyUnicode_KIND(str);
9716     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9717                       kind, end-start, ch, direction);
9718     if (result == -1)
9719         return -1;
9720     else
9721         return start + result;
9722 }
9723 
9724 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9725 tailmatch(PyObject *self,
9726           PyObject *substring,
9727           Py_ssize_t start,
9728           Py_ssize_t end,
9729           int direction)
9730 {
9731     int kind_self;
9732     int kind_sub;
9733     const void *data_self;
9734     const void *data_sub;
9735     Py_ssize_t offset;
9736     Py_ssize_t i;
9737     Py_ssize_t end_sub;
9738 
9739     if (PyUnicode_READY(self) == -1 ||
9740         PyUnicode_READY(substring) == -1)
9741         return -1;
9742 
9743     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9744     end -= PyUnicode_GET_LENGTH(substring);
9745     if (end < start)
9746         return 0;
9747 
9748     if (PyUnicode_GET_LENGTH(substring) == 0)
9749         return 1;
9750 
9751     kind_self = PyUnicode_KIND(self);
9752     data_self = PyUnicode_DATA(self);
9753     kind_sub = PyUnicode_KIND(substring);
9754     data_sub = PyUnicode_DATA(substring);
9755     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9756 
9757     if (direction > 0)
9758         offset = end;
9759     else
9760         offset = start;
9761 
9762     if (PyUnicode_READ(kind_self, data_self, offset) ==
9763         PyUnicode_READ(kind_sub, data_sub, 0) &&
9764         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9765         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9766         /* If both are of the same kind, memcmp is sufficient */
9767         if (kind_self == kind_sub) {
9768             return ! memcmp((char *)data_self +
9769                                 (offset * PyUnicode_KIND(substring)),
9770                             data_sub,
9771                             PyUnicode_GET_LENGTH(substring) *
9772                                 PyUnicode_KIND(substring));
9773         }
9774         /* otherwise we have to compare each character by first accessing it */
9775         else {
9776             /* We do not need to compare 0 and len(substring)-1 because
9777                the if statement above ensured already that they are equal
9778                when we end up here. */
9779             for (i = 1; i < end_sub; ++i) {
9780                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9781                     PyUnicode_READ(kind_sub, data_sub, i))
9782                     return 0;
9783             }
9784             return 1;
9785         }
9786     }
9787 
9788     return 0;
9789 }
9790 
9791 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9792 PyUnicode_Tailmatch(PyObject *str,
9793                     PyObject *substr,
9794                     Py_ssize_t start,
9795                     Py_ssize_t end,
9796                     int direction)
9797 {
9798     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9799         return -1;
9800 
9801     return tailmatch(str, substr, start, end, direction);
9802 }
9803 
9804 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9805 ascii_upper_or_lower(PyObject *self, int lower)
9806 {
9807     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9808     const char *data = PyUnicode_DATA(self);
9809     char *resdata;
9810     PyObject *res;
9811 
9812     res = PyUnicode_New(len, 127);
9813     if (res == NULL)
9814         return NULL;
9815     resdata = PyUnicode_DATA(res);
9816     if (lower)
9817         _Py_bytes_lower(resdata, data, len);
9818     else
9819         _Py_bytes_upper(resdata, data, len);
9820     return res;
9821 }
9822 
9823 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9824 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9825 {
9826     Py_ssize_t j;
9827     int final_sigma;
9828     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9829     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9830 
9831      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9832 
9833     where ! is a negation and \p{xxx} is a character with property xxx.
9834     */
9835     for (j = i - 1; j >= 0; j--) {
9836         c = PyUnicode_READ(kind, data, j);
9837         if (!_PyUnicode_IsCaseIgnorable(c))
9838             break;
9839     }
9840     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9841     if (final_sigma) {
9842         for (j = i + 1; j < length; j++) {
9843             c = PyUnicode_READ(kind, data, j);
9844             if (!_PyUnicode_IsCaseIgnorable(c))
9845                 break;
9846         }
9847         final_sigma = j == length || !_PyUnicode_IsCased(c);
9848     }
9849     return (final_sigma) ? 0x3C2 : 0x3C3;
9850 }
9851 
9852 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9853 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9854            Py_UCS4 c, Py_UCS4 *mapped)
9855 {
9856     /* Obscure special case. */
9857     if (c == 0x3A3) {
9858         mapped[0] = handle_capital_sigma(kind, data, length, i);
9859         return 1;
9860     }
9861     return _PyUnicode_ToLowerFull(c, mapped);
9862 }
9863 
9864 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9865 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9866 {
9867     Py_ssize_t i, k = 0;
9868     int n_res, j;
9869     Py_UCS4 c, mapped[3];
9870 
9871     c = PyUnicode_READ(kind, data, 0);
9872     n_res = _PyUnicode_ToTitleFull(c, mapped);
9873     for (j = 0; j < n_res; j++) {
9874         *maxchar = Py_MAX(*maxchar, mapped[j]);
9875         res[k++] = mapped[j];
9876     }
9877     for (i = 1; i < length; i++) {
9878         c = PyUnicode_READ(kind, data, i);
9879         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9880         for (j = 0; j < n_res; j++) {
9881             *maxchar = Py_MAX(*maxchar, mapped[j]);
9882             res[k++] = mapped[j];
9883         }
9884     }
9885     return k;
9886 }
9887 
9888 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9889 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9890     Py_ssize_t i, k = 0;
9891 
9892     for (i = 0; i < length; i++) {
9893         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9894         int n_res, j;
9895         if (Py_UNICODE_ISUPPER(c)) {
9896             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9897         }
9898         else if (Py_UNICODE_ISLOWER(c)) {
9899             n_res = _PyUnicode_ToUpperFull(c, mapped);
9900         }
9901         else {
9902             n_res = 1;
9903             mapped[0] = c;
9904         }
9905         for (j = 0; j < n_res; j++) {
9906             *maxchar = Py_MAX(*maxchar, mapped[j]);
9907             res[k++] = mapped[j];
9908         }
9909     }
9910     return k;
9911 }
9912 
9913 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9914 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9915                   Py_UCS4 *maxchar, int lower)
9916 {
9917     Py_ssize_t i, k = 0;
9918 
9919     for (i = 0; i < length; i++) {
9920         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9921         int n_res, j;
9922         if (lower)
9923             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9924         else
9925             n_res = _PyUnicode_ToUpperFull(c, mapped);
9926         for (j = 0; j < n_res; j++) {
9927             *maxchar = Py_MAX(*maxchar, mapped[j]);
9928             res[k++] = mapped[j];
9929         }
9930     }
9931     return k;
9932 }
9933 
9934 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9935 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9936 {
9937     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9938 }
9939 
9940 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9941 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9942 {
9943     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9944 }
9945 
9946 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9947 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9948 {
9949     Py_ssize_t i, k = 0;
9950 
9951     for (i = 0; i < length; i++) {
9952         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9953         Py_UCS4 mapped[3];
9954         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9955         for (j = 0; j < n_res; j++) {
9956             *maxchar = Py_MAX(*maxchar, mapped[j]);
9957             res[k++] = mapped[j];
9958         }
9959     }
9960     return k;
9961 }
9962 
9963 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9964 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9965 {
9966     Py_ssize_t i, k = 0;
9967     int previous_is_cased;
9968 
9969     previous_is_cased = 0;
9970     for (i = 0; i < length; i++) {
9971         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9972         Py_UCS4 mapped[3];
9973         int n_res, j;
9974 
9975         if (previous_is_cased)
9976             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9977         else
9978             n_res = _PyUnicode_ToTitleFull(c, mapped);
9979 
9980         for (j = 0; j < n_res; j++) {
9981             *maxchar = Py_MAX(*maxchar, mapped[j]);
9982             res[k++] = mapped[j];
9983         }
9984 
9985         previous_is_cased = _PyUnicode_IsCased(c);
9986     }
9987     return k;
9988 }
9989 
9990 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9991 case_operation(PyObject *self,
9992                Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9993 {
9994     PyObject *res = NULL;
9995     Py_ssize_t length, newlength = 0;
9996     int kind, outkind;
9997     const void *data;
9998     void *outdata;
9999     Py_UCS4 maxchar = 0, *tmp, *tmpend;
10000 
10001     assert(PyUnicode_IS_READY(self));
10002 
10003     kind = PyUnicode_KIND(self);
10004     data = PyUnicode_DATA(self);
10005     length = PyUnicode_GET_LENGTH(self);
10006     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10007         PyErr_SetString(PyExc_OverflowError, "string is too long");
10008         return NULL;
10009     }
10010     tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10011     if (tmp == NULL)
10012         return PyErr_NoMemory();
10013     newlength = perform(kind, data, length, tmp, &maxchar);
10014     res = PyUnicode_New(newlength, maxchar);
10015     if (res == NULL)
10016         goto leave;
10017     tmpend = tmp + newlength;
10018     outdata = PyUnicode_DATA(res);
10019     outkind = PyUnicode_KIND(res);
10020     switch (outkind) {
10021     case PyUnicode_1BYTE_KIND:
10022         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10023         break;
10024     case PyUnicode_2BYTE_KIND:
10025         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10026         break;
10027     case PyUnicode_4BYTE_KIND:
10028         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10029         break;
10030     default:
10031         Py_UNREACHABLE();
10032     }
10033   leave:
10034     PyMem_Free(tmp);
10035     return res;
10036 }
10037 
10038 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10039 PyUnicode_Join(PyObject *separator, PyObject *seq)
10040 {
10041     PyObject *res;
10042     PyObject *fseq;
10043     Py_ssize_t seqlen;
10044     PyObject **items;
10045 
10046     fseq = PySequence_Fast(seq, "can only join an iterable");
10047     if (fseq == NULL) {
10048         return NULL;
10049     }
10050 
10051     /* NOTE: the following code can't call back into Python code,
10052      * so we are sure that fseq won't be mutated.
10053      */
10054 
10055     items = PySequence_Fast_ITEMS(fseq);
10056     seqlen = PySequence_Fast_GET_SIZE(fseq);
10057     res = _PyUnicode_JoinArray(separator, items, seqlen);
10058     Py_DECREF(fseq);
10059     return res;
10060 }
10061 
10062 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)10063 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10064 {
10065     PyObject *res = NULL; /* the result */
10066     PyObject *sep = NULL;
10067     Py_ssize_t seplen;
10068     PyObject *item;
10069     Py_ssize_t sz, i, res_offset;
10070     Py_UCS4 maxchar;
10071     Py_UCS4 item_maxchar;
10072     int use_memcpy;
10073     unsigned char *res_data = NULL, *sep_data = NULL;
10074     PyObject *last_obj;
10075     unsigned int kind = 0;
10076 
10077     /* If empty sequence, return u"". */
10078     if (seqlen == 0) {
10079         _Py_RETURN_UNICODE_EMPTY();
10080     }
10081 
10082     /* If singleton sequence with an exact Unicode, return that. */
10083     last_obj = NULL;
10084     if (seqlen == 1) {
10085         if (PyUnicode_CheckExact(items[0])) {
10086             res = items[0];
10087             Py_INCREF(res);
10088             return res;
10089         }
10090         seplen = 0;
10091         maxchar = 0;
10092     }
10093     else {
10094         /* Set up sep and seplen */
10095         if (separator == NULL) {
10096             /* fall back to a blank space separator */
10097             sep = PyUnicode_FromOrdinal(' ');
10098             if (!sep)
10099                 goto onError;
10100             seplen = 1;
10101             maxchar = 32;
10102         }
10103         else {
10104             if (!PyUnicode_Check(separator)) {
10105                 PyErr_Format(PyExc_TypeError,
10106                              "separator: expected str instance,"
10107                              " %.80s found",
10108                              Py_TYPE(separator)->tp_name);
10109                 goto onError;
10110             }
10111             if (PyUnicode_READY(separator))
10112                 goto onError;
10113             sep = separator;
10114             seplen = PyUnicode_GET_LENGTH(separator);
10115             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10116             /* inc refcount to keep this code path symmetric with the
10117                above case of a blank separator */
10118             Py_INCREF(sep);
10119         }
10120         last_obj = sep;
10121     }
10122 
10123     /* There are at least two things to join, or else we have a subclass
10124      * of str in the sequence.
10125      * Do a pre-pass to figure out the total amount of space we'll
10126      * need (sz), and see whether all argument are strings.
10127      */
10128     sz = 0;
10129 #ifdef Py_DEBUG
10130     use_memcpy = 0;
10131 #else
10132     use_memcpy = 1;
10133 #endif
10134     for (i = 0; i < seqlen; i++) {
10135         size_t add_sz;
10136         item = items[i];
10137         if (!PyUnicode_Check(item)) {
10138             PyErr_Format(PyExc_TypeError,
10139                          "sequence item %zd: expected str instance,"
10140                          " %.80s found",
10141                          i, Py_TYPE(item)->tp_name);
10142             goto onError;
10143         }
10144         if (PyUnicode_READY(item) == -1)
10145             goto onError;
10146         add_sz = PyUnicode_GET_LENGTH(item);
10147         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10148         maxchar = Py_MAX(maxchar, item_maxchar);
10149         if (i != 0) {
10150             add_sz += seplen;
10151         }
10152         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10153             PyErr_SetString(PyExc_OverflowError,
10154                             "join() result is too long for a Python string");
10155             goto onError;
10156         }
10157         sz += add_sz;
10158         if (use_memcpy && last_obj != NULL) {
10159             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10160                 use_memcpy = 0;
10161         }
10162         last_obj = item;
10163     }
10164 
10165     res = PyUnicode_New(sz, maxchar);
10166     if (res == NULL)
10167         goto onError;
10168 
10169     /* Catenate everything. */
10170 #ifdef Py_DEBUG
10171     use_memcpy = 0;
10172 #else
10173     if (use_memcpy) {
10174         res_data = PyUnicode_1BYTE_DATA(res);
10175         kind = PyUnicode_KIND(res);
10176         if (seplen != 0)
10177             sep_data = PyUnicode_1BYTE_DATA(sep);
10178     }
10179 #endif
10180     if (use_memcpy) {
10181         for (i = 0; i < seqlen; ++i) {
10182             Py_ssize_t itemlen;
10183             item = items[i];
10184 
10185             /* Copy item, and maybe the separator. */
10186             if (i && seplen != 0) {
10187                 memcpy(res_data,
10188                           sep_data,
10189                           kind * seplen);
10190                 res_data += kind * seplen;
10191             }
10192 
10193             itemlen = PyUnicode_GET_LENGTH(item);
10194             if (itemlen != 0) {
10195                 memcpy(res_data,
10196                           PyUnicode_DATA(item),
10197                           kind * itemlen);
10198                 res_data += kind * itemlen;
10199             }
10200         }
10201         assert(res_data == PyUnicode_1BYTE_DATA(res)
10202                            + kind * PyUnicode_GET_LENGTH(res));
10203     }
10204     else {
10205         for (i = 0, res_offset = 0; i < seqlen; ++i) {
10206             Py_ssize_t itemlen;
10207             item = items[i];
10208 
10209             /* Copy item, and maybe the separator. */
10210             if (i && seplen != 0) {
10211                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10212                 res_offset += seplen;
10213             }
10214 
10215             itemlen = PyUnicode_GET_LENGTH(item);
10216             if (itemlen != 0) {
10217                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10218                 res_offset += itemlen;
10219             }
10220         }
10221         assert(res_offset == PyUnicode_GET_LENGTH(res));
10222     }
10223 
10224     Py_XDECREF(sep);
10225     assert(_PyUnicode_CheckConsistency(res, 1));
10226     return res;
10227 
10228   onError:
10229     Py_XDECREF(sep);
10230     Py_XDECREF(res);
10231     return NULL;
10232 }
10233 
10234 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10235 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10236                     Py_UCS4 fill_char)
10237 {
10238     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10239     void *data = PyUnicode_DATA(unicode);
10240     assert(PyUnicode_IS_READY(unicode));
10241     assert(unicode_modifiable(unicode));
10242     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10243     assert(start >= 0);
10244     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10245     unicode_fill(kind, data, fill_char, start, length);
10246 }
10247 
10248 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10249 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10250                Py_UCS4 fill_char)
10251 {
10252     Py_ssize_t maxlen;
10253 
10254     if (!PyUnicode_Check(unicode)) {
10255         PyErr_BadInternalCall();
10256         return -1;
10257     }
10258     if (PyUnicode_READY(unicode) == -1)
10259         return -1;
10260     if (unicode_check_modifiable(unicode))
10261         return -1;
10262 
10263     if (start < 0) {
10264         PyErr_SetString(PyExc_IndexError, "string index out of range");
10265         return -1;
10266     }
10267     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10268         PyErr_SetString(PyExc_ValueError,
10269                          "fill character is bigger than "
10270                          "the string maximum character");
10271         return -1;
10272     }
10273 
10274     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10275     length = Py_MIN(maxlen, length);
10276     if (length <= 0)
10277         return 0;
10278 
10279     _PyUnicode_FastFill(unicode, start, length, fill_char);
10280     return length;
10281 }
10282 
10283 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10284 pad(PyObject *self,
10285     Py_ssize_t left,
10286     Py_ssize_t right,
10287     Py_UCS4 fill)
10288 {
10289     PyObject *u;
10290     Py_UCS4 maxchar;
10291     int kind;
10292     void *data;
10293 
10294     if (left < 0)
10295         left = 0;
10296     if (right < 0)
10297         right = 0;
10298 
10299     if (left == 0 && right == 0)
10300         return unicode_result_unchanged(self);
10301 
10302     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10303         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10304         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10305         return NULL;
10306     }
10307     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10308     maxchar = Py_MAX(maxchar, fill);
10309     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10310     if (!u)
10311         return NULL;
10312 
10313     kind = PyUnicode_KIND(u);
10314     data = PyUnicode_DATA(u);
10315     if (left)
10316         unicode_fill(kind, data, fill, 0, left);
10317     if (right)
10318         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10319     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10320     assert(_PyUnicode_CheckConsistency(u, 1));
10321     return u;
10322 }
10323 
10324 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10325 PyUnicode_Splitlines(PyObject *string, int keepends)
10326 {
10327     PyObject *list;
10328 
10329     if (ensure_unicode(string) < 0)
10330         return NULL;
10331 
10332     switch (PyUnicode_KIND(string)) {
10333     case PyUnicode_1BYTE_KIND:
10334         if (PyUnicode_IS_ASCII(string))
10335             list = asciilib_splitlines(
10336                 string, PyUnicode_1BYTE_DATA(string),
10337                 PyUnicode_GET_LENGTH(string), keepends);
10338         else
10339             list = ucs1lib_splitlines(
10340                 string, PyUnicode_1BYTE_DATA(string),
10341                 PyUnicode_GET_LENGTH(string), keepends);
10342         break;
10343     case PyUnicode_2BYTE_KIND:
10344         list = ucs2lib_splitlines(
10345             string, PyUnicode_2BYTE_DATA(string),
10346             PyUnicode_GET_LENGTH(string), keepends);
10347         break;
10348     case PyUnicode_4BYTE_KIND:
10349         list = ucs4lib_splitlines(
10350             string, PyUnicode_4BYTE_DATA(string),
10351             PyUnicode_GET_LENGTH(string), keepends);
10352         break;
10353     default:
10354         Py_UNREACHABLE();
10355     }
10356     return list;
10357 }
10358 
10359 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10360 split(PyObject *self,
10361       PyObject *substring,
10362       Py_ssize_t maxcount)
10363 {
10364     int kind1, kind2;
10365     const void *buf1, *buf2;
10366     Py_ssize_t len1, len2;
10367     PyObject* out;
10368 
10369     if (maxcount < 0)
10370         maxcount = PY_SSIZE_T_MAX;
10371 
10372     if (PyUnicode_READY(self) == -1)
10373         return NULL;
10374 
10375     if (substring == NULL)
10376         switch (PyUnicode_KIND(self)) {
10377         case PyUnicode_1BYTE_KIND:
10378             if (PyUnicode_IS_ASCII(self))
10379                 return asciilib_split_whitespace(
10380                     self,  PyUnicode_1BYTE_DATA(self),
10381                     PyUnicode_GET_LENGTH(self), maxcount
10382                     );
10383             else
10384                 return ucs1lib_split_whitespace(
10385                     self,  PyUnicode_1BYTE_DATA(self),
10386                     PyUnicode_GET_LENGTH(self), maxcount
10387                     );
10388         case PyUnicode_2BYTE_KIND:
10389             return ucs2lib_split_whitespace(
10390                 self,  PyUnicode_2BYTE_DATA(self),
10391                 PyUnicode_GET_LENGTH(self), maxcount
10392                 );
10393         case PyUnicode_4BYTE_KIND:
10394             return ucs4lib_split_whitespace(
10395                 self,  PyUnicode_4BYTE_DATA(self),
10396                 PyUnicode_GET_LENGTH(self), maxcount
10397                 );
10398         default:
10399             Py_UNREACHABLE();
10400         }
10401 
10402     if (PyUnicode_READY(substring) == -1)
10403         return NULL;
10404 
10405     kind1 = PyUnicode_KIND(self);
10406     kind2 = PyUnicode_KIND(substring);
10407     len1 = PyUnicode_GET_LENGTH(self);
10408     len2 = PyUnicode_GET_LENGTH(substring);
10409     if (kind1 < kind2 || len1 < len2) {
10410         out = PyList_New(1);
10411         if (out == NULL)
10412             return NULL;
10413         Py_INCREF(self);
10414         PyList_SET_ITEM(out, 0, self);
10415         return out;
10416     }
10417     buf1 = PyUnicode_DATA(self);
10418     buf2 = PyUnicode_DATA(substring);
10419     if (kind2 != kind1) {
10420         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421         if (!buf2)
10422             return NULL;
10423     }
10424 
10425     switch (kind1) {
10426     case PyUnicode_1BYTE_KIND:
10427         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428             out = asciilib_split(
10429                 self,  buf1, len1, buf2, len2, maxcount);
10430         else
10431             out = ucs1lib_split(
10432                 self,  buf1, len1, buf2, len2, maxcount);
10433         break;
10434     case PyUnicode_2BYTE_KIND:
10435         out = ucs2lib_split(
10436             self,  buf1, len1, buf2, len2, maxcount);
10437         break;
10438     case PyUnicode_4BYTE_KIND:
10439         out = ucs4lib_split(
10440             self,  buf1, len1, buf2, len2, maxcount);
10441         break;
10442     default:
10443         out = NULL;
10444     }
10445     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446     if (kind2 != kind1)
10447         PyMem_Free((void *)buf2);
10448     return out;
10449 }
10450 
10451 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10452 rsplit(PyObject *self,
10453        PyObject *substring,
10454        Py_ssize_t maxcount)
10455 {
10456     int kind1, kind2;
10457     const void *buf1, *buf2;
10458     Py_ssize_t len1, len2;
10459     PyObject* out;
10460 
10461     if (maxcount < 0)
10462         maxcount = PY_SSIZE_T_MAX;
10463 
10464     if (PyUnicode_READY(self) == -1)
10465         return NULL;
10466 
10467     if (substring == NULL)
10468         switch (PyUnicode_KIND(self)) {
10469         case PyUnicode_1BYTE_KIND:
10470             if (PyUnicode_IS_ASCII(self))
10471                 return asciilib_rsplit_whitespace(
10472                     self,  PyUnicode_1BYTE_DATA(self),
10473                     PyUnicode_GET_LENGTH(self), maxcount
10474                     );
10475             else
10476                 return ucs1lib_rsplit_whitespace(
10477                     self,  PyUnicode_1BYTE_DATA(self),
10478                     PyUnicode_GET_LENGTH(self), maxcount
10479                     );
10480         case PyUnicode_2BYTE_KIND:
10481             return ucs2lib_rsplit_whitespace(
10482                 self,  PyUnicode_2BYTE_DATA(self),
10483                 PyUnicode_GET_LENGTH(self), maxcount
10484                 );
10485         case PyUnicode_4BYTE_KIND:
10486             return ucs4lib_rsplit_whitespace(
10487                 self,  PyUnicode_4BYTE_DATA(self),
10488                 PyUnicode_GET_LENGTH(self), maxcount
10489                 );
10490         default:
10491             Py_UNREACHABLE();
10492         }
10493 
10494     if (PyUnicode_READY(substring) == -1)
10495         return NULL;
10496 
10497     kind1 = PyUnicode_KIND(self);
10498     kind2 = PyUnicode_KIND(substring);
10499     len1 = PyUnicode_GET_LENGTH(self);
10500     len2 = PyUnicode_GET_LENGTH(substring);
10501     if (kind1 < kind2 || len1 < len2) {
10502         out = PyList_New(1);
10503         if (out == NULL)
10504             return NULL;
10505         Py_INCREF(self);
10506         PyList_SET_ITEM(out, 0, self);
10507         return out;
10508     }
10509     buf1 = PyUnicode_DATA(self);
10510     buf2 = PyUnicode_DATA(substring);
10511     if (kind2 != kind1) {
10512         buf2 = unicode_askind(kind2, buf2, len2, kind1);
10513         if (!buf2)
10514             return NULL;
10515     }
10516 
10517     switch (kind1) {
10518     case PyUnicode_1BYTE_KIND:
10519         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10520             out = asciilib_rsplit(
10521                 self,  buf1, len1, buf2, len2, maxcount);
10522         else
10523             out = ucs1lib_rsplit(
10524                 self,  buf1, len1, buf2, len2, maxcount);
10525         break;
10526     case PyUnicode_2BYTE_KIND:
10527         out = ucs2lib_rsplit(
10528             self,  buf1, len1, buf2, len2, maxcount);
10529         break;
10530     case PyUnicode_4BYTE_KIND:
10531         out = ucs4lib_rsplit(
10532             self,  buf1, len1, buf2, len2, maxcount);
10533         break;
10534     default:
10535         out = NULL;
10536     }
10537     assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10538     if (kind2 != kind1)
10539         PyMem_Free((void *)buf2);
10540     return out;
10541 }
10542 
10543 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10544 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10545             PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10546 {
10547     switch (kind) {
10548     case PyUnicode_1BYTE_KIND:
10549         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10550             return asciilib_find(buf1, len1, buf2, len2, offset);
10551         else
10552             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10553     case PyUnicode_2BYTE_KIND:
10554         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10555     case PyUnicode_4BYTE_KIND:
10556         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10557     }
10558     Py_UNREACHABLE();
10559 }
10560 
10561 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10562 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10563              PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10564 {
10565     switch (kind) {
10566     case PyUnicode_1BYTE_KIND:
10567         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10568             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10569         else
10570             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10571     case PyUnicode_2BYTE_KIND:
10572         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10573     case PyUnicode_4BYTE_KIND:
10574         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10575     }
10576     Py_UNREACHABLE();
10577 }
10578 
10579 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10580 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10581                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10582 {
10583     int kind = PyUnicode_KIND(u);
10584     void *data = PyUnicode_DATA(u);
10585     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10586     if (kind == PyUnicode_1BYTE_KIND) {
10587         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10588                                       (Py_UCS1 *)data + len,
10589                                       u1, u2, maxcount);
10590     }
10591     else if (kind == PyUnicode_2BYTE_KIND) {
10592         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10593                                       (Py_UCS2 *)data + len,
10594                                       u1, u2, maxcount);
10595     }
10596     else {
10597         assert(kind == PyUnicode_4BYTE_KIND);
10598         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10599                                       (Py_UCS4 *)data + len,
10600                                       u1, u2, maxcount);
10601     }
10602 }
10603 
10604 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10605 replace(PyObject *self, PyObject *str1,
10606         PyObject *str2, Py_ssize_t maxcount)
10607 {
10608     PyObject *u;
10609     const char *sbuf = PyUnicode_DATA(self);
10610     const void *buf1 = PyUnicode_DATA(str1);
10611     const void *buf2 = PyUnicode_DATA(str2);
10612     int srelease = 0, release1 = 0, release2 = 0;
10613     int skind = PyUnicode_KIND(self);
10614     int kind1 = PyUnicode_KIND(str1);
10615     int kind2 = PyUnicode_KIND(str2);
10616     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10617     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10618     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10619     int mayshrink;
10620     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10621 
10622     if (slen < len1)
10623         goto nothing;
10624 
10625     if (maxcount < 0)
10626         maxcount = PY_SSIZE_T_MAX;
10627     else if (maxcount == 0)
10628         goto nothing;
10629 
10630     if (str1 == str2)
10631         goto nothing;
10632 
10633     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10634     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10635     if (maxchar < maxchar_str1)
10636         /* substring too wide to be present */
10637         goto nothing;
10638     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10639     /* Replacing str1 with str2 may cause a maxchar reduction in the
10640        result string. */
10641     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10642     maxchar = Py_MAX(maxchar, maxchar_str2);
10643 
10644     if (len1 == len2) {
10645         /* same length */
10646         if (len1 == 0)
10647             goto nothing;
10648         if (len1 == 1) {
10649             /* replace characters */
10650             Py_UCS4 u1, u2;
10651             Py_ssize_t pos;
10652 
10653             u1 = PyUnicode_READ(kind1, buf1, 0);
10654             pos = findchar(sbuf, skind, slen, u1, 1);
10655             if (pos < 0)
10656                 goto nothing;
10657             u2 = PyUnicode_READ(kind2, buf2, 0);
10658             u = PyUnicode_New(slen, maxchar);
10659             if (!u)
10660                 goto error;
10661 
10662             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10663             replace_1char_inplace(u, pos, u1, u2, maxcount);
10664         }
10665         else {
10666             int rkind = skind;
10667             char *res;
10668             Py_ssize_t i;
10669 
10670             if (kind1 < rkind) {
10671                 /* widen substring */
10672                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10673                 if (!buf1) goto error;
10674                 release1 = 1;
10675             }
10676             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10677             if (i < 0)
10678                 goto nothing;
10679             if (rkind > kind2) {
10680                 /* widen replacement */
10681                 buf2 = unicode_askind(kind2, buf2, len2, rkind);
10682                 if (!buf2) goto error;
10683                 release2 = 1;
10684             }
10685             else if (rkind < kind2) {
10686                 /* widen self and buf1 */
10687                 rkind = kind2;
10688                 if (release1) {
10689                     assert(buf1 != PyUnicode_DATA(str1));
10690                     PyMem_Free((void *)buf1);
10691                     buf1 = PyUnicode_DATA(str1);
10692                     release1 = 0;
10693                 }
10694                 sbuf = unicode_askind(skind, sbuf, slen, rkind);
10695                 if (!sbuf) goto error;
10696                 srelease = 1;
10697                 buf1 = unicode_askind(kind1, buf1, len1, rkind);
10698                 if (!buf1) goto error;
10699                 release1 = 1;
10700             }
10701             u = PyUnicode_New(slen, maxchar);
10702             if (!u)
10703                 goto error;
10704             assert(PyUnicode_KIND(u) == rkind);
10705             res = PyUnicode_DATA(u);
10706 
10707             memcpy(res, sbuf, rkind * slen);
10708             /* change everything in-place, starting with this one */
10709             memcpy(res + rkind * i,
10710                    buf2,
10711                    rkind * len2);
10712             i += len1;
10713 
10714             while ( --maxcount > 0) {
10715                 i = anylib_find(rkind, self,
10716                                 sbuf+rkind*i, slen-i,
10717                                 str1, buf1, len1, i);
10718                 if (i == -1)
10719                     break;
10720                 memcpy(res + rkind * i,
10721                        buf2,
10722                        rkind * len2);
10723                 i += len1;
10724             }
10725         }
10726     }
10727     else {
10728         Py_ssize_t n, i, j, ires;
10729         Py_ssize_t new_size;
10730         int rkind = skind;
10731         char *res;
10732 
10733         if (kind1 < rkind) {
10734             /* widen substring */
10735             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10736             if (!buf1) goto error;
10737             release1 = 1;
10738         }
10739         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10740         if (n == 0)
10741             goto nothing;
10742         if (kind2 < rkind) {
10743             /* widen replacement */
10744             buf2 = unicode_askind(kind2, buf2, len2, rkind);
10745             if (!buf2) goto error;
10746             release2 = 1;
10747         }
10748         else if (kind2 > rkind) {
10749             /* widen self and buf1 */
10750             rkind = kind2;
10751             sbuf = unicode_askind(skind, sbuf, slen, rkind);
10752             if (!sbuf) goto error;
10753             srelease = 1;
10754             if (release1) {
10755                 assert(buf1 != PyUnicode_DATA(str1));
10756                 PyMem_Free((void *)buf1);
10757                 buf1 = PyUnicode_DATA(str1);
10758                 release1 = 0;
10759             }
10760             buf1 = unicode_askind(kind1, buf1, len1, rkind);
10761             if (!buf1) goto error;
10762             release1 = 1;
10763         }
10764         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10765            PyUnicode_GET_LENGTH(str1)); */
10766         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10767                 PyErr_SetString(PyExc_OverflowError,
10768                                 "replace string is too long");
10769                 goto error;
10770         }
10771         new_size = slen + n * (len2 - len1);
10772         if (new_size == 0) {
10773             u = unicode_new_empty();
10774             goto done;
10775         }
10776         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10777             PyErr_SetString(PyExc_OverflowError,
10778                             "replace string is too long");
10779             goto error;
10780         }
10781         u = PyUnicode_New(new_size, maxchar);
10782         if (!u)
10783             goto error;
10784         assert(PyUnicode_KIND(u) == rkind);
10785         res = PyUnicode_DATA(u);
10786         ires = i = 0;
10787         if (len1 > 0) {
10788             while (n-- > 0) {
10789                 /* look for next match */
10790                 j = anylib_find(rkind, self,
10791                                 sbuf + rkind * i, slen-i,
10792                                 str1, buf1, len1, i);
10793                 if (j == -1)
10794                     break;
10795                 else if (j > i) {
10796                     /* copy unchanged part [i:j] */
10797                     memcpy(res + rkind * ires,
10798                            sbuf + rkind * i,
10799                            rkind * (j-i));
10800                     ires += j - i;
10801                 }
10802                 /* copy substitution string */
10803                 if (len2 > 0) {
10804                     memcpy(res + rkind * ires,
10805                            buf2,
10806                            rkind * len2);
10807                     ires += len2;
10808                 }
10809                 i = j + len1;
10810             }
10811             if (i < slen)
10812                 /* copy tail [i:] */
10813                 memcpy(res + rkind * ires,
10814                        sbuf + rkind * i,
10815                        rkind * (slen-i));
10816         }
10817         else {
10818             /* interleave */
10819             while (n > 0) {
10820                 memcpy(res + rkind * ires,
10821                        buf2,
10822                        rkind * len2);
10823                 ires += len2;
10824                 if (--n <= 0)
10825                     break;
10826                 memcpy(res + rkind * ires,
10827                        sbuf + rkind * i,
10828                        rkind);
10829                 ires++;
10830                 i++;
10831             }
10832             memcpy(res + rkind * ires,
10833                    sbuf + rkind * i,
10834                    rkind * (slen-i));
10835         }
10836     }
10837 
10838     if (mayshrink) {
10839         unicode_adjust_maxchar(&u);
10840         if (u == NULL)
10841             goto error;
10842     }
10843 
10844   done:
10845     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10846     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10847     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10848     if (srelease)
10849         PyMem_Free((void *)sbuf);
10850     if (release1)
10851         PyMem_Free((void *)buf1);
10852     if (release2)
10853         PyMem_Free((void *)buf2);
10854     assert(_PyUnicode_CheckConsistency(u, 1));
10855     return u;
10856 
10857   nothing:
10858     /* nothing to replace; return original string (when possible) */
10859     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10860     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10861     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10862     if (srelease)
10863         PyMem_Free((void *)sbuf);
10864     if (release1)
10865         PyMem_Free((void *)buf1);
10866     if (release2)
10867         PyMem_Free((void *)buf2);
10868     return unicode_result_unchanged(self);
10869 
10870   error:
10871     assert(srelease == (sbuf != PyUnicode_DATA(self)));
10872     assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10873     assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10874     if (srelease)
10875         PyMem_Free((void *)sbuf);
10876     if (release1)
10877         PyMem_Free((void *)buf1);
10878     if (release2)
10879         PyMem_Free((void *)buf2);
10880     return NULL;
10881 }
10882 
10883 /* --- Unicode Object Methods --------------------------------------------- */
10884 
10885 /*[clinic input]
10886 str.title as unicode_title
10887 
10888 Return a version of the string where each word is titlecased.
10889 
10890 More specifically, words start with uppercased characters and all remaining
10891 cased characters have lower case.
10892 [clinic start generated code]*/
10893 
10894 static PyObject *
unicode_title_impl(PyObject * self)10895 unicode_title_impl(PyObject *self)
10896 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10897 {
10898     if (PyUnicode_READY(self) == -1)
10899         return NULL;
10900     return case_operation(self, do_title);
10901 }
10902 
10903 /*[clinic input]
10904 str.capitalize as unicode_capitalize
10905 
10906 Return a capitalized version of the string.
10907 
10908 More specifically, make the first character have upper case and the rest lower
10909 case.
10910 [clinic start generated code]*/
10911 
10912 static PyObject *
unicode_capitalize_impl(PyObject * self)10913 unicode_capitalize_impl(PyObject *self)
10914 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10915 {
10916     if (PyUnicode_READY(self) == -1)
10917         return NULL;
10918     if (PyUnicode_GET_LENGTH(self) == 0)
10919         return unicode_result_unchanged(self);
10920     return case_operation(self, do_capitalize);
10921 }
10922 
10923 /*[clinic input]
10924 str.casefold as unicode_casefold
10925 
10926 Return a version of the string suitable for caseless comparisons.
10927 [clinic start generated code]*/
10928 
10929 static PyObject *
unicode_casefold_impl(PyObject * self)10930 unicode_casefold_impl(PyObject *self)
10931 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10932 {
10933     if (PyUnicode_READY(self) == -1)
10934         return NULL;
10935     if (PyUnicode_IS_ASCII(self))
10936         return ascii_upper_or_lower(self, 1);
10937     return case_operation(self, do_casefold);
10938 }
10939 
10940 
10941 /* Argument converter. Accepts a single Unicode character. */
10942 
10943 static int
convert_uc(PyObject * obj,void * addr)10944 convert_uc(PyObject *obj, void *addr)
10945 {
10946     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10947 
10948     if (!PyUnicode_Check(obj)) {
10949         PyErr_Format(PyExc_TypeError,
10950                      "The fill character must be a unicode character, "
10951                      "not %.100s", Py_TYPE(obj)->tp_name);
10952         return 0;
10953     }
10954     if (PyUnicode_READY(obj) < 0)
10955         return 0;
10956     if (PyUnicode_GET_LENGTH(obj) != 1) {
10957         PyErr_SetString(PyExc_TypeError,
10958                         "The fill character must be exactly one character long");
10959         return 0;
10960     }
10961     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10962     return 1;
10963 }
10964 
10965 /*[clinic input]
10966 str.center as unicode_center
10967 
10968     width: Py_ssize_t
10969     fillchar: Py_UCS4 = ' '
10970     /
10971 
10972 Return a centered string of length width.
10973 
10974 Padding is done using the specified fill character (default is a space).
10975 [clinic start generated code]*/
10976 
10977 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10978 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10979 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10980 {
10981     Py_ssize_t marg, left;
10982 
10983     if (PyUnicode_READY(self) == -1)
10984         return NULL;
10985 
10986     if (PyUnicode_GET_LENGTH(self) >= width)
10987         return unicode_result_unchanged(self);
10988 
10989     marg = width - PyUnicode_GET_LENGTH(self);
10990     left = marg / 2 + (marg & width & 1);
10991 
10992     return pad(self, left, marg - left, fillchar);
10993 }
10994 
10995 /* This function assumes that str1 and str2 are readied by the caller. */
10996 
10997 static int
unicode_compare(PyObject * str1,PyObject * str2)10998 unicode_compare(PyObject *str1, PyObject *str2)
10999 {
11000 #define COMPARE(TYPE1, TYPE2) \
11001     do { \
11002         TYPE1* p1 = (TYPE1 *)data1; \
11003         TYPE2* p2 = (TYPE2 *)data2; \
11004         TYPE1* end = p1 + len; \
11005         Py_UCS4 c1, c2; \
11006         for (; p1 != end; p1++, p2++) { \
11007             c1 = *p1; \
11008             c2 = *p2; \
11009             if (c1 != c2) \
11010                 return (c1 < c2) ? -1 : 1; \
11011         } \
11012     } \
11013     while (0)
11014 
11015     int kind1, kind2;
11016     const void *data1, *data2;
11017     Py_ssize_t len1, len2, len;
11018 
11019     kind1 = PyUnicode_KIND(str1);
11020     kind2 = PyUnicode_KIND(str2);
11021     data1 = PyUnicode_DATA(str1);
11022     data2 = PyUnicode_DATA(str2);
11023     len1 = PyUnicode_GET_LENGTH(str1);
11024     len2 = PyUnicode_GET_LENGTH(str2);
11025     len = Py_MIN(len1, len2);
11026 
11027     switch(kind1) {
11028     case PyUnicode_1BYTE_KIND:
11029     {
11030         switch(kind2) {
11031         case PyUnicode_1BYTE_KIND:
11032         {
11033             int cmp = memcmp(data1, data2, len);
11034             /* normalize result of memcmp() into the range [-1; 1] */
11035             if (cmp < 0)
11036                 return -1;
11037             if (cmp > 0)
11038                 return 1;
11039             break;
11040         }
11041         case PyUnicode_2BYTE_KIND:
11042             COMPARE(Py_UCS1, Py_UCS2);
11043             break;
11044         case PyUnicode_4BYTE_KIND:
11045             COMPARE(Py_UCS1, Py_UCS4);
11046             break;
11047         default:
11048             Py_UNREACHABLE();
11049         }
11050         break;
11051     }
11052     case PyUnicode_2BYTE_KIND:
11053     {
11054         switch(kind2) {
11055         case PyUnicode_1BYTE_KIND:
11056             COMPARE(Py_UCS2, Py_UCS1);
11057             break;
11058         case PyUnicode_2BYTE_KIND:
11059         {
11060             COMPARE(Py_UCS2, Py_UCS2);
11061             break;
11062         }
11063         case PyUnicode_4BYTE_KIND:
11064             COMPARE(Py_UCS2, Py_UCS4);
11065             break;
11066         default:
11067             Py_UNREACHABLE();
11068         }
11069         break;
11070     }
11071     case PyUnicode_4BYTE_KIND:
11072     {
11073         switch(kind2) {
11074         case PyUnicode_1BYTE_KIND:
11075             COMPARE(Py_UCS4, Py_UCS1);
11076             break;
11077         case PyUnicode_2BYTE_KIND:
11078             COMPARE(Py_UCS4, Py_UCS2);
11079             break;
11080         case PyUnicode_4BYTE_KIND:
11081         {
11082 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11083             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11084             /* normalize result of wmemcmp() into the range [-1; 1] */
11085             if (cmp < 0)
11086                 return -1;
11087             if (cmp > 0)
11088                 return 1;
11089 #else
11090             COMPARE(Py_UCS4, Py_UCS4);
11091 #endif
11092             break;
11093         }
11094         default:
11095             Py_UNREACHABLE();
11096         }
11097         break;
11098     }
11099     default:
11100         Py_UNREACHABLE();
11101     }
11102 
11103     if (len1 == len2)
11104         return 0;
11105     if (len1 < len2)
11106         return -1;
11107     else
11108         return 1;
11109 
11110 #undef COMPARE
11111 }
11112 
11113 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11114 unicode_compare_eq(PyObject *str1, PyObject *str2)
11115 {
11116     int kind;
11117     const void *data1, *data2;
11118     Py_ssize_t len;
11119     int cmp;
11120 
11121     len = PyUnicode_GET_LENGTH(str1);
11122     if (PyUnicode_GET_LENGTH(str2) != len)
11123         return 0;
11124     kind = PyUnicode_KIND(str1);
11125     if (PyUnicode_KIND(str2) != kind)
11126         return 0;
11127     data1 = PyUnicode_DATA(str1);
11128     data2 = PyUnicode_DATA(str2);
11129 
11130     cmp = memcmp(data1, data2, len * kind);
11131     return (cmp == 0);
11132 }
11133 
11134 int
_PyUnicode_Equal(PyObject * str1,PyObject * str2)11135 _PyUnicode_Equal(PyObject *str1, PyObject *str2)
11136 {
11137     assert(PyUnicode_Check(str1));
11138     assert(PyUnicode_Check(str2));
11139     if (str1 == str2) {
11140         return 1;
11141     }
11142     if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) {
11143         return -1;
11144     }
11145     return unicode_compare_eq(str1, str2);
11146 }
11147 
11148 
11149 int
PyUnicode_Compare(PyObject * left,PyObject * right)11150 PyUnicode_Compare(PyObject *left, PyObject *right)
11151 {
11152     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11153         if (PyUnicode_READY(left) == -1 ||
11154             PyUnicode_READY(right) == -1)
11155             return -1;
11156 
11157         /* a string is equal to itself */
11158         if (left == right)
11159             return 0;
11160 
11161         return unicode_compare(left, right);
11162     }
11163     PyErr_Format(PyExc_TypeError,
11164                  "Can't compare %.100s and %.100s",
11165                  Py_TYPE(left)->tp_name,
11166                  Py_TYPE(right)->tp_name);
11167     return -1;
11168 }
11169 
11170 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11171 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11172 {
11173     Py_ssize_t i;
11174     int kind;
11175     Py_UCS4 chr;
11176     const unsigned char *ustr = (const unsigned char *)str;
11177 
11178     assert(_PyUnicode_CHECK(uni));
11179     if (!PyUnicode_IS_READY(uni)) {
11180         const wchar_t *ws = _PyUnicode_WSTR(uni);
11181         /* Compare Unicode string and source character set string */
11182         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11183             if (chr != ustr[i])
11184                 return (chr < ustr[i]) ? -1 : 1;
11185         }
11186         /* This check keeps Python strings that end in '\0' from comparing equal
11187          to C strings identical up to that point. */
11188         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11189             return 1; /* uni is longer */
11190         if (ustr[i])
11191             return -1; /* str is longer */
11192         return 0;
11193     }
11194     kind = PyUnicode_KIND(uni);
11195     if (kind == PyUnicode_1BYTE_KIND) {
11196         const void *data = PyUnicode_1BYTE_DATA(uni);
11197         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11198         size_t len, len2 = strlen(str);
11199         int cmp;
11200 
11201         len = Py_MIN(len1, len2);
11202         cmp = memcmp(data, str, len);
11203         if (cmp != 0) {
11204             if (cmp < 0)
11205                 return -1;
11206             else
11207                 return 1;
11208         }
11209         if (len1 > len2)
11210             return 1; /* uni is longer */
11211         if (len1 < len2)
11212             return -1; /* str is longer */
11213         return 0;
11214     }
11215     else {
11216         const void *data = PyUnicode_DATA(uni);
11217         /* Compare Unicode string and source character set string */
11218         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11219             if (chr != (unsigned char)str[i])
11220                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11221         /* This check keeps Python strings that end in '\0' from comparing equal
11222          to C strings identical up to that point. */
11223         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11224             return 1; /* uni is longer */
11225         if (str[i])
11226             return -1; /* str is longer */
11227         return 0;
11228     }
11229 }
11230 
11231 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11232 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11233 {
11234     size_t i, len;
11235     const wchar_t *p;
11236     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11237     if (strlen(str) != len)
11238         return 0;
11239     p = _PyUnicode_WSTR(unicode);
11240     assert(p);
11241     for (i = 0; i < len; i++) {
11242         unsigned char c = (unsigned char)str[i];
11243         if (c >= 128 || p[i] != (wchar_t)c)
11244             return 0;
11245     }
11246     return 1;
11247 }
11248 
11249 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11250 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11251 {
11252     size_t len;
11253     assert(_PyUnicode_CHECK(unicode));
11254     assert(str);
11255 #ifndef NDEBUG
11256     for (const char *p = str; *p; p++) {
11257         assert((unsigned char)*p < 128);
11258     }
11259 #endif
11260     if (PyUnicode_READY(unicode) == -1) {
11261         /* Memory error or bad data */
11262         PyErr_Clear();
11263         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11264     }
11265     if (!PyUnicode_IS_ASCII(unicode))
11266         return 0;
11267     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11268     return strlen(str) == len &&
11269            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11270 }
11271 
11272 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11273 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11274 {
11275     PyObject *right_uni;
11276 
11277     assert(_PyUnicode_CHECK(left));
11278     assert(right->string);
11279 #ifndef NDEBUG
11280     for (const char *p = right->string; *p; p++) {
11281         assert((unsigned char)*p < 128);
11282     }
11283 #endif
11284 
11285     if (PyUnicode_READY(left) == -1) {
11286         /* memory error or bad data */
11287         PyErr_Clear();
11288         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11289     }
11290 
11291     if (!PyUnicode_IS_ASCII(left))
11292         return 0;
11293 
11294     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11295     if (right_uni == NULL) {
11296         /* memory error or bad data */
11297         PyErr_Clear();
11298         return _PyUnicode_EqualToASCIIString(left, right->string);
11299     }
11300 
11301     if (left == right_uni)
11302         return 1;
11303 
11304     if (PyUnicode_CHECK_INTERNED(left))
11305         return 0;
11306 
11307     assert(_PyUnicode_HASH(right_uni) != -1);
11308     Py_hash_t hash = _PyUnicode_HASH(left);
11309     if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11310         return 0;
11311     }
11312 
11313     return unicode_compare_eq(left, right_uni);
11314 }
11315 
11316 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11317 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11318 {
11319     int result;
11320 
11321     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11322         Py_RETURN_NOTIMPLEMENTED;
11323 
11324     if (PyUnicode_READY(left) == -1 ||
11325         PyUnicode_READY(right) == -1)
11326         return NULL;
11327 
11328     if (left == right) {
11329         switch (op) {
11330         case Py_EQ:
11331         case Py_LE:
11332         case Py_GE:
11333             /* a string is equal to itself */
11334             Py_RETURN_TRUE;
11335         case Py_NE:
11336         case Py_LT:
11337         case Py_GT:
11338             Py_RETURN_FALSE;
11339         default:
11340             PyErr_BadArgument();
11341             return NULL;
11342         }
11343     }
11344     else if (op == Py_EQ || op == Py_NE) {
11345         result = unicode_compare_eq(left, right);
11346         result ^= (op == Py_NE);
11347         return PyBool_FromLong(result);
11348     }
11349     else {
11350         result = unicode_compare(left, right);
11351         Py_RETURN_RICHCOMPARE(result, 0, op);
11352     }
11353 }
11354 
11355 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11356 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11357 {
11358     return unicode_eq(aa, bb);
11359 }
11360 
11361 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11362 PyUnicode_Contains(PyObject *str, PyObject *substr)
11363 {
11364     int kind1, kind2;
11365     const void *buf1, *buf2;
11366     Py_ssize_t len1, len2;
11367     int result;
11368 
11369     if (!PyUnicode_Check(substr)) {
11370         PyErr_Format(PyExc_TypeError,
11371                      "'in <string>' requires string as left operand, not %.100s",
11372                      Py_TYPE(substr)->tp_name);
11373         return -1;
11374     }
11375     if (PyUnicode_READY(substr) == -1)
11376         return -1;
11377     if (ensure_unicode(str) < 0)
11378         return -1;
11379 
11380     kind1 = PyUnicode_KIND(str);
11381     kind2 = PyUnicode_KIND(substr);
11382     if (kind1 < kind2)
11383         return 0;
11384     len1 = PyUnicode_GET_LENGTH(str);
11385     len2 = PyUnicode_GET_LENGTH(substr);
11386     if (len1 < len2)
11387         return 0;
11388     buf1 = PyUnicode_DATA(str);
11389     buf2 = PyUnicode_DATA(substr);
11390     if (len2 == 1) {
11391         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11392         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11393         return result;
11394     }
11395     if (kind2 != kind1) {
11396         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11397         if (!buf2)
11398             return -1;
11399     }
11400 
11401     switch (kind1) {
11402     case PyUnicode_1BYTE_KIND:
11403         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11404         break;
11405     case PyUnicode_2BYTE_KIND:
11406         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11407         break;
11408     case PyUnicode_4BYTE_KIND:
11409         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11410         break;
11411     default:
11412         Py_UNREACHABLE();
11413     }
11414 
11415     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11416     if (kind2 != kind1)
11417         PyMem_Free((void *)buf2);
11418 
11419     return result;
11420 }
11421 
11422 /* Concat to string or Unicode object giving a new Unicode object. */
11423 
11424 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11425 PyUnicode_Concat(PyObject *left, PyObject *right)
11426 {
11427     PyObject *result;
11428     Py_UCS4 maxchar, maxchar2;
11429     Py_ssize_t left_len, right_len, new_len;
11430 
11431     if (ensure_unicode(left) < 0)
11432         return NULL;
11433 
11434     if (!PyUnicode_Check(right)) {
11435         PyErr_Format(PyExc_TypeError,
11436                      "can only concatenate str (not \"%.200s\") to str",
11437                      Py_TYPE(right)->tp_name);
11438         return NULL;
11439     }
11440     if (PyUnicode_READY(right) < 0)
11441         return NULL;
11442 
11443     /* Shortcuts */
11444     PyObject *empty = unicode_get_empty();  // Borrowed reference
11445     if (left == empty) {
11446         return PyUnicode_FromObject(right);
11447     }
11448     if (right == empty) {
11449         return PyUnicode_FromObject(left);
11450     }
11451 
11452     left_len = PyUnicode_GET_LENGTH(left);
11453     right_len = PyUnicode_GET_LENGTH(right);
11454     if (left_len > PY_SSIZE_T_MAX - right_len) {
11455         PyErr_SetString(PyExc_OverflowError,
11456                         "strings are too large to concat");
11457         return NULL;
11458     }
11459     new_len = left_len + right_len;
11460 
11461     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11462     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11463     maxchar = Py_MAX(maxchar, maxchar2);
11464 
11465     /* Concat the two Unicode strings */
11466     result = PyUnicode_New(new_len, maxchar);
11467     if (result == NULL)
11468         return NULL;
11469     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11470     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11471     assert(_PyUnicode_CheckConsistency(result, 1));
11472     return result;
11473 }
11474 
11475 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11476 PyUnicode_Append(PyObject **p_left, PyObject *right)
11477 {
11478     PyObject *left, *res;
11479     Py_UCS4 maxchar, maxchar2;
11480     Py_ssize_t left_len, right_len, new_len;
11481 
11482     if (p_left == NULL) {
11483         if (!PyErr_Occurred())
11484             PyErr_BadInternalCall();
11485         return;
11486     }
11487     left = *p_left;
11488     if (right == NULL || left == NULL
11489         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11490         if (!PyErr_Occurred())
11491             PyErr_BadInternalCall();
11492         goto error;
11493     }
11494 
11495     if (PyUnicode_READY(left) == -1)
11496         goto error;
11497     if (PyUnicode_READY(right) == -1)
11498         goto error;
11499 
11500     /* Shortcuts */
11501     PyObject *empty = unicode_get_empty();  // Borrowed reference
11502     if (left == empty) {
11503         Py_DECREF(left);
11504         Py_INCREF(right);
11505         *p_left = right;
11506         return;
11507     }
11508     if (right == empty) {
11509         return;
11510     }
11511 
11512     left_len = PyUnicode_GET_LENGTH(left);
11513     right_len = PyUnicode_GET_LENGTH(right);
11514     if (left_len > PY_SSIZE_T_MAX - right_len) {
11515         PyErr_SetString(PyExc_OverflowError,
11516                         "strings are too large to concat");
11517         goto error;
11518     }
11519     new_len = left_len + right_len;
11520 
11521     if (unicode_modifiable(left)
11522         && PyUnicode_CheckExact(right)
11523         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11524         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11525            to change the structure size, but characters are stored just after
11526            the structure, and so it requires to move all characters which is
11527            not so different than duplicating the string. */
11528         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11529     {
11530         /* append inplace */
11531         if (unicode_resize(p_left, new_len) != 0)
11532             goto error;
11533 
11534         /* copy 'right' into the newly allocated area of 'left' */
11535         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11536     }
11537     else {
11538         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11539         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11540         maxchar = Py_MAX(maxchar, maxchar2);
11541 
11542         /* Concat the two Unicode strings */
11543         res = PyUnicode_New(new_len, maxchar);
11544         if (res == NULL)
11545             goto error;
11546         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11547         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11548         Py_DECREF(left);
11549         *p_left = res;
11550     }
11551     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11552     return;
11553 
11554 error:
11555     Py_CLEAR(*p_left);
11556 }
11557 
11558 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11559 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11560 {
11561     PyUnicode_Append(pleft, right);
11562     Py_XDECREF(right);
11563 }
11564 
11565 /*
11566 Wraps stringlib_parse_args_finds() and additionally ensures that the
11567 first argument is a unicode object.
11568 */
11569 
11570 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11571 parse_args_finds_unicode(const char * function_name, PyObject *args,
11572                          PyObject **substring,
11573                          Py_ssize_t *start, Py_ssize_t *end)
11574 {
11575     if(stringlib_parse_args_finds(function_name, args, substring,
11576                                   start, end)) {
11577         if (ensure_unicode(*substring) < 0)
11578             return 0;
11579         return 1;
11580     }
11581     return 0;
11582 }
11583 
11584 PyDoc_STRVAR(count__doc__,
11585              "S.count(sub[, start[, end]]) -> int\n\
11586 \n\
11587 Return the number of non-overlapping occurrences of substring sub in\n\
11588 string S[start:end].  Optional arguments start and end are\n\
11589 interpreted as in slice notation.");
11590 
11591 static PyObject *
unicode_count(PyObject * self,PyObject * args)11592 unicode_count(PyObject *self, PyObject *args)
11593 {
11594     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11595     Py_ssize_t start = 0;
11596     Py_ssize_t end = PY_SSIZE_T_MAX;
11597     PyObject *result;
11598     int kind1, kind2;
11599     const void *buf1, *buf2;
11600     Py_ssize_t len1, len2, iresult;
11601 
11602     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11603         return NULL;
11604 
11605     kind1 = PyUnicode_KIND(self);
11606     kind2 = PyUnicode_KIND(substring);
11607     if (kind1 < kind2)
11608         return PyLong_FromLong(0);
11609 
11610     len1 = PyUnicode_GET_LENGTH(self);
11611     len2 = PyUnicode_GET_LENGTH(substring);
11612     ADJUST_INDICES(start, end, len1);
11613     if (end - start < len2)
11614         return PyLong_FromLong(0);
11615 
11616     buf1 = PyUnicode_DATA(self);
11617     buf2 = PyUnicode_DATA(substring);
11618     if (kind2 != kind1) {
11619         buf2 = unicode_askind(kind2, buf2, len2, kind1);
11620         if (!buf2)
11621             return NULL;
11622     }
11623     switch (kind1) {
11624     case PyUnicode_1BYTE_KIND:
11625         iresult = ucs1lib_count(
11626             ((const Py_UCS1*)buf1) + start, end - start,
11627             buf2, len2, PY_SSIZE_T_MAX
11628             );
11629         break;
11630     case PyUnicode_2BYTE_KIND:
11631         iresult = ucs2lib_count(
11632             ((const Py_UCS2*)buf1) + start, end - start,
11633             buf2, len2, PY_SSIZE_T_MAX
11634             );
11635         break;
11636     case PyUnicode_4BYTE_KIND:
11637         iresult = ucs4lib_count(
11638             ((const Py_UCS4*)buf1) + start, end - start,
11639             buf2, len2, PY_SSIZE_T_MAX
11640             );
11641         break;
11642     default:
11643         Py_UNREACHABLE();
11644     }
11645 
11646     result = PyLong_FromSsize_t(iresult);
11647 
11648     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11649     if (kind2 != kind1)
11650         PyMem_Free((void *)buf2);
11651 
11652     return result;
11653 }
11654 
11655 /*[clinic input]
11656 str.encode as unicode_encode
11657 
11658     encoding: str(c_default="NULL") = 'utf-8'
11659         The encoding in which to encode the string.
11660     errors: str(c_default="NULL") = 'strict'
11661         The error handling scheme to use for encoding errors.
11662         The default is 'strict' meaning that encoding errors raise a
11663         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11664         'xmlcharrefreplace' as well as any other name registered with
11665         codecs.register_error that can handle UnicodeEncodeErrors.
11666 
11667 Encode the string using the codec registered for encoding.
11668 [clinic start generated code]*/
11669 
11670 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11671 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11672 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11673 {
11674     return PyUnicode_AsEncodedString(self, encoding, errors);
11675 }
11676 
11677 /*[clinic input]
11678 str.expandtabs as unicode_expandtabs
11679 
11680     tabsize: int = 8
11681 
11682 Return a copy where all tab characters are expanded using spaces.
11683 
11684 If tabsize is not given, a tab size of 8 characters is assumed.
11685 [clinic start generated code]*/
11686 
11687 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11688 unicode_expandtabs_impl(PyObject *self, int tabsize)
11689 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11690 {
11691     Py_ssize_t i, j, line_pos, src_len, incr;
11692     Py_UCS4 ch;
11693     PyObject *u;
11694     const void *src_data;
11695     void *dest_data;
11696     int kind;
11697     int found;
11698 
11699     if (PyUnicode_READY(self) == -1)
11700         return NULL;
11701 
11702     /* First pass: determine size of output string */
11703     src_len = PyUnicode_GET_LENGTH(self);
11704     i = j = line_pos = 0;
11705     kind = PyUnicode_KIND(self);
11706     src_data = PyUnicode_DATA(self);
11707     found = 0;
11708     for (; i < src_len; i++) {
11709         ch = PyUnicode_READ(kind, src_data, i);
11710         if (ch == '\t') {
11711             found = 1;
11712             if (tabsize > 0) {
11713                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11714                 if (j > PY_SSIZE_T_MAX - incr)
11715                     goto overflow;
11716                 line_pos += incr;
11717                 j += incr;
11718             }
11719         }
11720         else {
11721             if (j > PY_SSIZE_T_MAX - 1)
11722                 goto overflow;
11723             line_pos++;
11724             j++;
11725             if (ch == '\n' || ch == '\r')
11726                 line_pos = 0;
11727         }
11728     }
11729     if (!found)
11730         return unicode_result_unchanged(self);
11731 
11732     /* Second pass: create output string and fill it */
11733     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11734     if (!u)
11735         return NULL;
11736     dest_data = PyUnicode_DATA(u);
11737 
11738     i = j = line_pos = 0;
11739 
11740     for (; i < src_len; i++) {
11741         ch = PyUnicode_READ(kind, src_data, i);
11742         if (ch == '\t') {
11743             if (tabsize > 0) {
11744                 incr = tabsize - (line_pos % tabsize);
11745                 line_pos += incr;
11746                 unicode_fill(kind, dest_data, ' ', j, incr);
11747                 j += incr;
11748             }
11749         }
11750         else {
11751             line_pos++;
11752             PyUnicode_WRITE(kind, dest_data, j, ch);
11753             j++;
11754             if (ch == '\n' || ch == '\r')
11755                 line_pos = 0;
11756         }
11757     }
11758     assert (j == PyUnicode_GET_LENGTH(u));
11759     return unicode_result(u);
11760 
11761   overflow:
11762     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11763     return NULL;
11764 }
11765 
11766 PyDoc_STRVAR(find__doc__,
11767              "S.find(sub[, start[, end]]) -> int\n\
11768 \n\
11769 Return the lowest index in S where substring sub is found,\n\
11770 such that sub is contained within S[start:end].  Optional\n\
11771 arguments start and end are interpreted as in slice notation.\n\
11772 \n\
11773 Return -1 on failure.");
11774 
11775 static PyObject *
unicode_find(PyObject * self,PyObject * args)11776 unicode_find(PyObject *self, PyObject *args)
11777 {
11778     /* initialize variables to prevent gcc warning */
11779     PyObject *substring = NULL;
11780     Py_ssize_t start = 0;
11781     Py_ssize_t end = 0;
11782     Py_ssize_t result;
11783 
11784     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11785         return NULL;
11786 
11787     if (PyUnicode_READY(self) == -1)
11788         return NULL;
11789 
11790     result = any_find_slice(self, substring, start, end, 1);
11791 
11792     if (result == -2)
11793         return NULL;
11794 
11795     return PyLong_FromSsize_t(result);
11796 }
11797 
11798 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11799 unicode_getitem(PyObject *self, Py_ssize_t index)
11800 {
11801     const void *data;
11802     enum PyUnicode_Kind kind;
11803     Py_UCS4 ch;
11804 
11805     if (!PyUnicode_Check(self)) {
11806         PyErr_BadArgument();
11807         return NULL;
11808     }
11809     if (PyUnicode_READY(self) == -1) {
11810         return NULL;
11811     }
11812     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11813         PyErr_SetString(PyExc_IndexError, "string index out of range");
11814         return NULL;
11815     }
11816     kind = PyUnicode_KIND(self);
11817     data = PyUnicode_DATA(self);
11818     ch = PyUnicode_READ(kind, data, index);
11819     return unicode_char(ch);
11820 }
11821 
11822 /* Believe it or not, this produces the same value for ASCII strings
11823    as bytes_hash(). */
11824 static Py_hash_t
unicode_hash(PyObject * self)11825 unicode_hash(PyObject *self)
11826 {
11827     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11828 
11829 #ifdef Py_DEBUG
11830     assert(_Py_HashSecret_Initialized);
11831 #endif
11832     if (_PyUnicode_HASH(self) != -1)
11833         return _PyUnicode_HASH(self);
11834     if (PyUnicode_READY(self) == -1)
11835         return -1;
11836 
11837     x = _Py_HashBytes(PyUnicode_DATA(self),
11838                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11839     _PyUnicode_HASH(self) = x;
11840     return x;
11841 }
11842 
11843 PyDoc_STRVAR(index__doc__,
11844              "S.index(sub[, start[, end]]) -> int\n\
11845 \n\
11846 Return the lowest index in S where substring sub is found,\n\
11847 such that sub is contained within S[start:end].  Optional\n\
11848 arguments start and end are interpreted as in slice notation.\n\
11849 \n\
11850 Raises ValueError when the substring is not found.");
11851 
11852 static PyObject *
unicode_index(PyObject * self,PyObject * args)11853 unicode_index(PyObject *self, PyObject *args)
11854 {
11855     /* initialize variables to prevent gcc warning */
11856     Py_ssize_t result;
11857     PyObject *substring = NULL;
11858     Py_ssize_t start = 0;
11859     Py_ssize_t end = 0;
11860 
11861     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11862         return NULL;
11863 
11864     if (PyUnicode_READY(self) == -1)
11865         return NULL;
11866 
11867     result = any_find_slice(self, substring, start, end, 1);
11868 
11869     if (result == -2)
11870         return NULL;
11871 
11872     if (result < 0) {
11873         PyErr_SetString(PyExc_ValueError, "substring not found");
11874         return NULL;
11875     }
11876 
11877     return PyLong_FromSsize_t(result);
11878 }
11879 
11880 /*[clinic input]
11881 str.isascii as unicode_isascii
11882 
11883 Return True if all characters in the string are ASCII, False otherwise.
11884 
11885 ASCII characters have code points in the range U+0000-U+007F.
11886 Empty string is ASCII too.
11887 [clinic start generated code]*/
11888 
11889 static PyObject *
unicode_isascii_impl(PyObject * self)11890 unicode_isascii_impl(PyObject *self)
11891 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11892 {
11893     if (PyUnicode_READY(self) == -1) {
11894         return NULL;
11895     }
11896     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11897 }
11898 
11899 /*[clinic input]
11900 str.islower as unicode_islower
11901 
11902 Return True if the string is a lowercase string, False otherwise.
11903 
11904 A string is lowercase if all cased characters in the string are lowercase and
11905 there is at least one cased character in the string.
11906 [clinic start generated code]*/
11907 
11908 static PyObject *
unicode_islower_impl(PyObject * self)11909 unicode_islower_impl(PyObject *self)
11910 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11911 {
11912     Py_ssize_t i, length;
11913     int kind;
11914     const void *data;
11915     int cased;
11916 
11917     if (PyUnicode_READY(self) == -1)
11918         return NULL;
11919     length = PyUnicode_GET_LENGTH(self);
11920     kind = PyUnicode_KIND(self);
11921     data = PyUnicode_DATA(self);
11922 
11923     /* Shortcut for single character strings */
11924     if (length == 1)
11925         return PyBool_FromLong(
11926             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11927 
11928     /* Special case for empty strings */
11929     if (length == 0)
11930         Py_RETURN_FALSE;
11931 
11932     cased = 0;
11933     for (i = 0; i < length; i++) {
11934         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11935 
11936         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11937             Py_RETURN_FALSE;
11938         else if (!cased && Py_UNICODE_ISLOWER(ch))
11939             cased = 1;
11940     }
11941     return PyBool_FromLong(cased);
11942 }
11943 
11944 /*[clinic input]
11945 str.isupper as unicode_isupper
11946 
11947 Return True if the string is an uppercase string, False otherwise.
11948 
11949 A string is uppercase if all cased characters in the string are uppercase and
11950 there is at least one cased character in the string.
11951 [clinic start generated code]*/
11952 
11953 static PyObject *
unicode_isupper_impl(PyObject * self)11954 unicode_isupper_impl(PyObject *self)
11955 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11956 {
11957     Py_ssize_t i, length;
11958     int kind;
11959     const void *data;
11960     int cased;
11961 
11962     if (PyUnicode_READY(self) == -1)
11963         return NULL;
11964     length = PyUnicode_GET_LENGTH(self);
11965     kind = PyUnicode_KIND(self);
11966     data = PyUnicode_DATA(self);
11967 
11968     /* Shortcut for single character strings */
11969     if (length == 1)
11970         return PyBool_FromLong(
11971             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11972 
11973     /* Special case for empty strings */
11974     if (length == 0)
11975         Py_RETURN_FALSE;
11976 
11977     cased = 0;
11978     for (i = 0; i < length; i++) {
11979         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11980 
11981         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11982             Py_RETURN_FALSE;
11983         else if (!cased && Py_UNICODE_ISUPPER(ch))
11984             cased = 1;
11985     }
11986     return PyBool_FromLong(cased);
11987 }
11988 
11989 /*[clinic input]
11990 str.istitle as unicode_istitle
11991 
11992 Return True if the string is a title-cased string, False otherwise.
11993 
11994 In a title-cased string, upper- and title-case characters may only
11995 follow uncased characters and lowercase characters only cased ones.
11996 [clinic start generated code]*/
11997 
11998 static PyObject *
unicode_istitle_impl(PyObject * self)11999 unicode_istitle_impl(PyObject *self)
12000 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12001 {
12002     Py_ssize_t i, length;
12003     int kind;
12004     const void *data;
12005     int cased, previous_is_cased;
12006 
12007     if (PyUnicode_READY(self) == -1)
12008         return NULL;
12009     length = PyUnicode_GET_LENGTH(self);
12010     kind = PyUnicode_KIND(self);
12011     data = PyUnicode_DATA(self);
12012 
12013     /* Shortcut for single character strings */
12014     if (length == 1) {
12015         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12016         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12017                                (Py_UNICODE_ISUPPER(ch) != 0));
12018     }
12019 
12020     /* Special case for empty strings */
12021     if (length == 0)
12022         Py_RETURN_FALSE;
12023 
12024     cased = 0;
12025     previous_is_cased = 0;
12026     for (i = 0; i < length; i++) {
12027         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12028 
12029         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12030             if (previous_is_cased)
12031                 Py_RETURN_FALSE;
12032             previous_is_cased = 1;
12033             cased = 1;
12034         }
12035         else if (Py_UNICODE_ISLOWER(ch)) {
12036             if (!previous_is_cased)
12037                 Py_RETURN_FALSE;
12038             previous_is_cased = 1;
12039             cased = 1;
12040         }
12041         else
12042             previous_is_cased = 0;
12043     }
12044     return PyBool_FromLong(cased);
12045 }
12046 
12047 /*[clinic input]
12048 str.isspace as unicode_isspace
12049 
12050 Return True if the string is a whitespace string, False otherwise.
12051 
12052 A string is whitespace if all characters in the string are whitespace and there
12053 is at least one character in the string.
12054 [clinic start generated code]*/
12055 
12056 static PyObject *
unicode_isspace_impl(PyObject * self)12057 unicode_isspace_impl(PyObject *self)
12058 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12059 {
12060     Py_ssize_t i, length;
12061     int kind;
12062     const void *data;
12063 
12064     if (PyUnicode_READY(self) == -1)
12065         return NULL;
12066     length = PyUnicode_GET_LENGTH(self);
12067     kind = PyUnicode_KIND(self);
12068     data = PyUnicode_DATA(self);
12069 
12070     /* Shortcut for single character strings */
12071     if (length == 1)
12072         return PyBool_FromLong(
12073             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12074 
12075     /* Special case for empty strings */
12076     if (length == 0)
12077         Py_RETURN_FALSE;
12078 
12079     for (i = 0; i < length; i++) {
12080         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12081         if (!Py_UNICODE_ISSPACE(ch))
12082             Py_RETURN_FALSE;
12083     }
12084     Py_RETURN_TRUE;
12085 }
12086 
12087 /*[clinic input]
12088 str.isalpha as unicode_isalpha
12089 
12090 Return True if the string is an alphabetic string, False otherwise.
12091 
12092 A string is alphabetic if all characters in the string are alphabetic and there
12093 is at least one character in the string.
12094 [clinic start generated code]*/
12095 
12096 static PyObject *
unicode_isalpha_impl(PyObject * self)12097 unicode_isalpha_impl(PyObject *self)
12098 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12099 {
12100     Py_ssize_t i, length;
12101     int kind;
12102     const void *data;
12103 
12104     if (PyUnicode_READY(self) == -1)
12105         return NULL;
12106     length = PyUnicode_GET_LENGTH(self);
12107     kind = PyUnicode_KIND(self);
12108     data = PyUnicode_DATA(self);
12109 
12110     /* Shortcut for single character strings */
12111     if (length == 1)
12112         return PyBool_FromLong(
12113             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12114 
12115     /* Special case for empty strings */
12116     if (length == 0)
12117         Py_RETURN_FALSE;
12118 
12119     for (i = 0; i < length; i++) {
12120         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12121             Py_RETURN_FALSE;
12122     }
12123     Py_RETURN_TRUE;
12124 }
12125 
12126 /*[clinic input]
12127 str.isalnum as unicode_isalnum
12128 
12129 Return True if the string is an alpha-numeric string, False otherwise.
12130 
12131 A string is alpha-numeric if all characters in the string are alpha-numeric and
12132 there is at least one character in the string.
12133 [clinic start generated code]*/
12134 
12135 static PyObject *
unicode_isalnum_impl(PyObject * self)12136 unicode_isalnum_impl(PyObject *self)
12137 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12138 {
12139     int kind;
12140     const void *data;
12141     Py_ssize_t len, i;
12142 
12143     if (PyUnicode_READY(self) == -1)
12144         return NULL;
12145 
12146     kind = PyUnicode_KIND(self);
12147     data = PyUnicode_DATA(self);
12148     len = PyUnicode_GET_LENGTH(self);
12149 
12150     /* Shortcut for single character strings */
12151     if (len == 1) {
12152         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12153         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12154     }
12155 
12156     /* Special case for empty strings */
12157     if (len == 0)
12158         Py_RETURN_FALSE;
12159 
12160     for (i = 0; i < len; i++) {
12161         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12162         if (!Py_UNICODE_ISALNUM(ch))
12163             Py_RETURN_FALSE;
12164     }
12165     Py_RETURN_TRUE;
12166 }
12167 
12168 /*[clinic input]
12169 str.isdecimal as unicode_isdecimal
12170 
12171 Return True if the string is a decimal string, False otherwise.
12172 
12173 A string is a decimal string if all characters in the string are decimal and
12174 there is at least one character in the string.
12175 [clinic start generated code]*/
12176 
12177 static PyObject *
unicode_isdecimal_impl(PyObject * self)12178 unicode_isdecimal_impl(PyObject *self)
12179 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12180 {
12181     Py_ssize_t i, length;
12182     int kind;
12183     const void *data;
12184 
12185     if (PyUnicode_READY(self) == -1)
12186         return NULL;
12187     length = PyUnicode_GET_LENGTH(self);
12188     kind = PyUnicode_KIND(self);
12189     data = PyUnicode_DATA(self);
12190 
12191     /* Shortcut for single character strings */
12192     if (length == 1)
12193         return PyBool_FromLong(
12194             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12195 
12196     /* Special case for empty strings */
12197     if (length == 0)
12198         Py_RETURN_FALSE;
12199 
12200     for (i = 0; i < length; i++) {
12201         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12202             Py_RETURN_FALSE;
12203     }
12204     Py_RETURN_TRUE;
12205 }
12206 
12207 /*[clinic input]
12208 str.isdigit as unicode_isdigit
12209 
12210 Return True if the string is a digit string, False otherwise.
12211 
12212 A string is a digit string if all characters in the string are digits and there
12213 is at least one character in the string.
12214 [clinic start generated code]*/
12215 
12216 static PyObject *
unicode_isdigit_impl(PyObject * self)12217 unicode_isdigit_impl(PyObject *self)
12218 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12219 {
12220     Py_ssize_t i, length;
12221     int kind;
12222     const void *data;
12223 
12224     if (PyUnicode_READY(self) == -1)
12225         return NULL;
12226     length = PyUnicode_GET_LENGTH(self);
12227     kind = PyUnicode_KIND(self);
12228     data = PyUnicode_DATA(self);
12229 
12230     /* Shortcut for single character strings */
12231     if (length == 1) {
12232         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12233         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12234     }
12235 
12236     /* Special case for empty strings */
12237     if (length == 0)
12238         Py_RETURN_FALSE;
12239 
12240     for (i = 0; i < length; i++) {
12241         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12242             Py_RETURN_FALSE;
12243     }
12244     Py_RETURN_TRUE;
12245 }
12246 
12247 /*[clinic input]
12248 str.isnumeric as unicode_isnumeric
12249 
12250 Return True if the string is a numeric string, False otherwise.
12251 
12252 A string is numeric if all characters in the string are numeric and there is at
12253 least one character in the string.
12254 [clinic start generated code]*/
12255 
12256 static PyObject *
unicode_isnumeric_impl(PyObject * self)12257 unicode_isnumeric_impl(PyObject *self)
12258 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12259 {
12260     Py_ssize_t i, length;
12261     int kind;
12262     const void *data;
12263 
12264     if (PyUnicode_READY(self) == -1)
12265         return NULL;
12266     length = PyUnicode_GET_LENGTH(self);
12267     kind = PyUnicode_KIND(self);
12268     data = PyUnicode_DATA(self);
12269 
12270     /* Shortcut for single character strings */
12271     if (length == 1)
12272         return PyBool_FromLong(
12273             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12274 
12275     /* Special case for empty strings */
12276     if (length == 0)
12277         Py_RETURN_FALSE;
12278 
12279     for (i = 0; i < length; i++) {
12280         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12281             Py_RETURN_FALSE;
12282     }
12283     Py_RETURN_TRUE;
12284 }
12285 
12286 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12287 _PyUnicode_ScanIdentifier(PyObject *self)
12288 {
12289     Py_ssize_t i;
12290     if (PyUnicode_READY(self) == -1)
12291         return -1;
12292 
12293     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12294     if (len == 0) {
12295         /* an empty string is not a valid identifier */
12296         return 0;
12297     }
12298 
12299     int kind = PyUnicode_KIND(self);
12300     const void *data = PyUnicode_DATA(self);
12301     Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12302     /* PEP 3131 says that the first character must be in
12303        XID_Start and subsequent characters in XID_Continue,
12304        and for the ASCII range, the 2.x rules apply (i.e
12305        start with letters and underscore, continue with
12306        letters, digits, underscore). However, given the current
12307        definition of XID_Start and XID_Continue, it is sufficient
12308        to check just for these, except that _ must be allowed
12309        as starting an identifier.  */
12310     if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12311         return 0;
12312     }
12313 
12314     for (i = 1; i < len; i++) {
12315         ch = PyUnicode_READ(kind, data, i);
12316         if (!_PyUnicode_IsXidContinue(ch)) {
12317             return i;
12318         }
12319     }
12320     return i;
12321 }
12322 
/* Report whether self is a valid identifier: returns 1 if it is, 0 if not.

   Strings for which PyUnicode_IS_READY() is true are handled by
   _PyUnicode_ScanIdentifier(); other strings are examined directly through
   their wchar_t buffer. */
int
PyUnicode_IsIdentifier(PyObject *self)
{
    if (PyUnicode_IS_READY(self)) {
        /* The scan returns the index of the first offending character, so
           the whole string is an identifier only if that equals the length. */
        Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
        /* an empty string is not a valid identifier */
        return len && i == len;
    }
    else {
        /* NOTE(review): the diagnostic push/ignore/pop pair presumably
           silences deprecation warnings for PyUnicode_GET_SIZE and
           _PyUnicode_WSTR used below -- confirm against the macro
           definitions. */
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
        if (len == 0) {
            /* an empty string is not a valid identifier */
            return 0;
        }

        const wchar_t *wstr = _PyUnicode_WSTR(self);
        Py_UCS4 ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
        /* With 2-byte wchar_t, a high surrogate followed by a low surrogate
           is combined into a single code point before it is classified. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
            && i < len
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
            i++;
        }
#endif
        /* First character: XID_Start or underscore. */
        if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
            return 0;
        }

        /* Remaining characters: every one must be XID_Continue. */
        while (i < len) {
            ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
            /* Same surrogate-pair joining as for the first character. */
            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
                && i < len
                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
                i++;
            }
#endif
            if (!_PyUnicode_IsXidContinue(ch)) {
                return 0;
            }
        }
        return 1;
_Py_COMP_DIAG_POP
    }
}
12375 
12376 /*[clinic input]
12377 str.isidentifier as unicode_isidentifier
12378 
12379 Return True if the string is a valid Python identifier, False otherwise.
12380 
12381 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12382 such as "def" or "class".
12383 [clinic start generated code]*/
12384 
12385 static PyObject *
unicode_isidentifier_impl(PyObject * self)12386 unicode_isidentifier_impl(PyObject *self)
12387 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12388 {
12389     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12390 }
12391 
12392 /*[clinic input]
12393 str.isprintable as unicode_isprintable
12394 
12395 Return True if the string is printable, False otherwise.
12396 
12397 A string is printable if all of its characters are considered printable in
12398 repr() or if it is empty.
12399 [clinic start generated code]*/
12400 
12401 static PyObject *
unicode_isprintable_impl(PyObject * self)12402 unicode_isprintable_impl(PyObject *self)
12403 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12404 {
12405     Py_ssize_t i, length;
12406     int kind;
12407     const void *data;
12408 
12409     if (PyUnicode_READY(self) == -1)
12410         return NULL;
12411     length = PyUnicode_GET_LENGTH(self);
12412     kind = PyUnicode_KIND(self);
12413     data = PyUnicode_DATA(self);
12414 
12415     /* Shortcut for single character strings */
12416     if (length == 1)
12417         return PyBool_FromLong(
12418             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12419 
12420     for (i = 0; i < length; i++) {
12421         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12422             Py_RETURN_FALSE;
12423         }
12424     }
12425     Py_RETURN_TRUE;
12426 }
12427 
12428 /*[clinic input]
12429 str.join as unicode_join
12430 
12431     iterable: object
12432     /
12433 
12434 Concatenate any number of strings.
12435 
12436 The string whose method is called is inserted in between each given string.
12437 The result is returned as a new string.
12438 
12439 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12440 [clinic start generated code]*/
12441 
12442 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12443 unicode_join(PyObject *self, PyObject *iterable)
12444 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12445 {
12446     return PyUnicode_Join(self, iterable);
12447 }
12448 
12449 static Py_ssize_t
unicode_length(PyObject * self)12450 unicode_length(PyObject *self)
12451 {
12452     if (PyUnicode_READY(self) == -1)
12453         return -1;
12454     return PyUnicode_GET_LENGTH(self);
12455 }
12456 
12457 /*[clinic input]
12458 str.ljust as unicode_ljust
12459 
12460     width: Py_ssize_t
12461     fillchar: Py_UCS4 = ' '
12462     /
12463 
12464 Return a left-justified string of length width.
12465 
12466 Padding is done using the specified fill character (default is a space).
12467 [clinic start generated code]*/
12468 
12469 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12470 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12471 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12472 {
12473     if (PyUnicode_READY(self) == -1)
12474         return NULL;
12475 
12476     if (PyUnicode_GET_LENGTH(self) >= width)
12477         return unicode_result_unchanged(self);
12478 
12479     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12480 }
12481 
12482 /*[clinic input]
12483 str.lower as unicode_lower
12484 
12485 Return a copy of the string converted to lowercase.
12486 [clinic start generated code]*/
12487 
12488 static PyObject *
unicode_lower_impl(PyObject * self)12489 unicode_lower_impl(PyObject *self)
12490 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12491 {
12492     if (PyUnicode_READY(self) == -1)
12493         return NULL;
12494     if (PyUnicode_IS_ASCII(self))
12495         return ascii_upper_or_lower(self, 1);
12496     return case_operation(self, do_lower);
12497 }
12498 
/* Strip modes accepted by do_strip(), do_argstrip() and _PyUnicode_XStrip().
   The values double as indices into stripfuncnames[] below, so the two must
   be kept in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above; used in error messages. */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Look up the method name for strip mode i (no bounds checking). */
#define STRIPNAME(i) (stripfuncnames[i])
12507 
12508 /* externally visible for str.strip(unicode) */
12509 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12510 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12511 {
12512     const void *data;
12513     int kind;
12514     Py_ssize_t i, j, len;
12515     BLOOM_MASK sepmask;
12516     Py_ssize_t seplen;
12517 
12518     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12519         return NULL;
12520 
12521     kind = PyUnicode_KIND(self);
12522     data = PyUnicode_DATA(self);
12523     len = PyUnicode_GET_LENGTH(self);
12524     seplen = PyUnicode_GET_LENGTH(sepobj);
12525     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12526                               PyUnicode_DATA(sepobj),
12527                               seplen);
12528 
12529     i = 0;
12530     if (striptype != RIGHTSTRIP) {
12531         while (i < len) {
12532             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12533             if (!BLOOM(sepmask, ch))
12534                 break;
12535             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12536                 break;
12537             i++;
12538         }
12539     }
12540 
12541     j = len;
12542     if (striptype != LEFTSTRIP) {
12543         j--;
12544         while (j >= i) {
12545             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12546             if (!BLOOM(sepmask, ch))
12547                 break;
12548             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12549                 break;
12550             j--;
12551         }
12552 
12553         j++;
12554     }
12555 
12556     return PyUnicode_Substring(self, i, j);
12557 }
12558 
12559 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12560 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12561 {
12562     const unsigned char *data;
12563     int kind;
12564     Py_ssize_t length;
12565 
12566     if (PyUnicode_READY(self) == -1)
12567         return NULL;
12568 
12569     length = PyUnicode_GET_LENGTH(self);
12570     end = Py_MIN(end, length);
12571 
12572     if (start == 0 && end == length)
12573         return unicode_result_unchanged(self);
12574 
12575     if (start < 0 || end < 0) {
12576         PyErr_SetString(PyExc_IndexError, "string index out of range");
12577         return NULL;
12578     }
12579     if (start >= length || end < start)
12580         _Py_RETURN_UNICODE_EMPTY();
12581 
12582     length = end - start;
12583     if (PyUnicode_IS_ASCII(self)) {
12584         data = PyUnicode_1BYTE_DATA(self);
12585         return _PyUnicode_FromASCII((const char*)(data + start), length);
12586     }
12587     else {
12588         kind = PyUnicode_KIND(self);
12589         data = PyUnicode_1BYTE_DATA(self);
12590         return PyUnicode_FromKindAndData(kind,
12591                                          data + kind * start,
12592                                          length);
12593     }
12594 }
12595 
12596 static PyObject *
do_strip(PyObject * self,int striptype)12597 do_strip(PyObject *self, int striptype)
12598 {
12599     Py_ssize_t len, i, j;
12600 
12601     if (PyUnicode_READY(self) == -1)
12602         return NULL;
12603 
12604     len = PyUnicode_GET_LENGTH(self);
12605 
12606     if (PyUnicode_IS_ASCII(self)) {
12607         const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12608 
12609         i = 0;
12610         if (striptype != RIGHTSTRIP) {
12611             while (i < len) {
12612                 Py_UCS1 ch = data[i];
12613                 if (!_Py_ascii_whitespace[ch])
12614                     break;
12615                 i++;
12616             }
12617         }
12618 
12619         j = len;
12620         if (striptype != LEFTSTRIP) {
12621             j--;
12622             while (j >= i) {
12623                 Py_UCS1 ch = data[j];
12624                 if (!_Py_ascii_whitespace[ch])
12625                     break;
12626                 j--;
12627             }
12628             j++;
12629         }
12630     }
12631     else {
12632         int kind = PyUnicode_KIND(self);
12633         const void *data = PyUnicode_DATA(self);
12634 
12635         i = 0;
12636         if (striptype != RIGHTSTRIP) {
12637             while (i < len) {
12638                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12639                 if (!Py_UNICODE_ISSPACE(ch))
12640                     break;
12641                 i++;
12642             }
12643         }
12644 
12645         j = len;
12646         if (striptype != LEFTSTRIP) {
12647             j--;
12648             while (j >= i) {
12649                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12650                 if (!Py_UNICODE_ISSPACE(ch))
12651                     break;
12652                 j--;
12653             }
12654             j++;
12655         }
12656     }
12657 
12658     return PyUnicode_Substring(self, i, j);
12659 }
12660 
12661 
12662 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12663 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12664 {
12665     if (sep != Py_None) {
12666         if (PyUnicode_Check(sep))
12667             return _PyUnicode_XStrip(self, striptype, sep);
12668         else {
12669             PyErr_Format(PyExc_TypeError,
12670                          "%s arg must be None or str",
12671                          STRIPNAME(striptype));
12672             return NULL;
12673         }
12674     }
12675 
12676     return do_strip(self, striptype);
12677 }
12678 
12679 
12680 /*[clinic input]
12681 str.strip as unicode_strip
12682 
12683     chars: object = None
12684     /
12685 
12686 Return a copy of the string with leading and trailing whitespace removed.
12687 
12688 If chars is given and not None, remove characters in chars instead.
12689 [clinic start generated code]*/
12690 
12691 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12692 unicode_strip_impl(PyObject *self, PyObject *chars)
12693 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12694 {
12695     return do_argstrip(self, BOTHSTRIP, chars);
12696 }
12697 
12698 
12699 /*[clinic input]
12700 str.lstrip as unicode_lstrip
12701 
12702     chars: object = None
12703     /
12704 
12705 Return a copy of the string with leading whitespace removed.
12706 
12707 If chars is given and not None, remove characters in chars instead.
12708 [clinic start generated code]*/
12709 
12710 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12711 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12712 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12713 {
12714     return do_argstrip(self, LEFTSTRIP, chars);
12715 }
12716 
12717 
12718 /*[clinic input]
12719 str.rstrip as unicode_rstrip
12720 
12721     chars: object = None
12722     /
12723 
12724 Return a copy of the string with trailing whitespace removed.
12725 
12726 If chars is given and not None, remove characters in chars instead.
12727 [clinic start generated code]*/
12728 
12729 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12730 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12731 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12732 {
12733     return do_argstrip(self, RIGHTSTRIP, chars);
12734 }
12735 
12736 
12737 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12738 unicode_repeat(PyObject *str, Py_ssize_t len)
12739 {
12740     PyObject *u;
12741     Py_ssize_t nchars, n;
12742 
12743     if (len < 1)
12744         _Py_RETURN_UNICODE_EMPTY();
12745 
12746     /* no repeat, return original string */
12747     if (len == 1)
12748         return unicode_result_unchanged(str);
12749 
12750     if (PyUnicode_READY(str) == -1)
12751         return NULL;
12752 
12753     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12754         PyErr_SetString(PyExc_OverflowError,
12755                         "repeated string is too long");
12756         return NULL;
12757     }
12758     nchars = len * PyUnicode_GET_LENGTH(str);
12759 
12760     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12761     if (!u)
12762         return NULL;
12763     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12764 
12765     if (PyUnicode_GET_LENGTH(str) == 1) {
12766         int kind = PyUnicode_KIND(str);
12767         Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12768         if (kind == PyUnicode_1BYTE_KIND) {
12769             void *to = PyUnicode_DATA(u);
12770             memset(to, (unsigned char)fill_char, len);
12771         }
12772         else if (kind == PyUnicode_2BYTE_KIND) {
12773             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12774             for (n = 0; n < len; ++n)
12775                 ucs2[n] = fill_char;
12776         } else {
12777             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12778             assert(kind == PyUnicode_4BYTE_KIND);
12779             for (n = 0; n < len; ++n)
12780                 ucs4[n] = fill_char;
12781         }
12782     }
12783     else {
12784         Py_ssize_t char_size = PyUnicode_KIND(str);
12785         char *to = (char *) PyUnicode_DATA(u);
12786         _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12787             PyUnicode_GET_LENGTH(str) * char_size);
12788     }
12789 
12790     assert(_PyUnicode_CheckConsistency(u, 1));
12791     return u;
12792 }
12793 
12794 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12795 PyUnicode_Replace(PyObject *str,
12796                   PyObject *substr,
12797                   PyObject *replstr,
12798                   Py_ssize_t maxcount)
12799 {
12800     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12801             ensure_unicode(replstr) < 0)
12802         return NULL;
12803     return replace(str, substr, replstr, maxcount);
12804 }
12805 
12806 /*[clinic input]
12807 str.replace as unicode_replace
12808 
12809     old: unicode
12810     new: unicode
12811     count: Py_ssize_t = -1
12812         Maximum number of occurrences to replace.
12813         -1 (the default value) means replace all occurrences.
12814     /
12815 
12816 Return a copy with all occurrences of substring old replaced by new.
12817 
12818 If the optional argument count is given, only the first count occurrences are
12819 replaced.
12820 [clinic start generated code]*/
12821 
12822 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12823 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12824                      Py_ssize_t count)
12825 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12826 {
12827     if (PyUnicode_READY(self) == -1)
12828         return NULL;
12829     return replace(self, old, new, count);
12830 }
12831 
12832 /*[clinic input]
12833 str.removeprefix as unicode_removeprefix
12834 
12835     prefix: unicode
12836     /
12837 
12838 Return a str with the given prefix string removed if present.
12839 
12840 If the string starts with the prefix string, return string[len(prefix):].
12841 Otherwise, return a copy of the original string.
12842 [clinic start generated code]*/
12843 
12844 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12845 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12846 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12847 {
12848     int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12849     if (match == -1) {
12850         return NULL;
12851     }
12852     if (match) {
12853         return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12854                                    PyUnicode_GET_LENGTH(self));
12855     }
12856     return unicode_result_unchanged(self);
12857 }
12858 
12859 /*[clinic input]
12860 str.removesuffix as unicode_removesuffix
12861 
12862     suffix: unicode
12863     /
12864 
12865 Return a str with the given suffix string removed if present.
12866 
12867 If the string ends with the suffix string and that suffix is not empty,
12868 return string[:-len(suffix)]. Otherwise, return a copy of the original
12869 string.
12870 [clinic start generated code]*/
12871 
12872 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12873 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12874 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12875 {
12876     int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12877     if (match == -1) {
12878         return NULL;
12879     }
12880     if (match) {
12881         return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12882                                             - PyUnicode_GET_LENGTH(suffix));
12883     }
12884     return unicode_result_unchanged(self);
12885 }
12886 
/* Build the repr() of a str object as a new str.

   Works in two passes over the input.  The first pass computes the exact
   output length, counts single and double quotes, and tracks the largest
   character that will be copied verbatim (which sizes the result).  The
   second pass writes the quoted, escaped text.  The escape widths added in
   the first pass must match what the second pass writes, character for
   character.

   Returns NULL if the string cannot be made ready, if the output length
   would overflow Py_ssize_t (OverflowError), or if allocation fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind, unchanged;
    const void *idata;
    void *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 0;
    /* Quotes, backslashes and escape digits are all ASCII, so the result
       needs at least the ASCII range. */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        /* Number of output characters this input character expands to. */
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                /* Copied verbatim, so it may widen the result. */
                max = ch > max ? ch : max;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    /* Default to single quotes; "unchanged" means no character needed an
       escape, so the input can be block-copied between the quotes. */
    quote = '\'';
    unchanged = (osize == isize);
    if (squote) {
        unchanged = 0;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }
    osize += 2;   /* quotes */

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Both delimiters are written up front; the loop below only fills the
       positions in between. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);
    if (unchanged) {
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
    }
    else {
        /* o is the next output index; it starts after the opening quote. */
        for (i = 0, o = 1; i < isize; i++) {
            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

            /* Escape quotes and backslashes */
            if ((ch == quote) || (ch == '\\')) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }

            /* Map special whitespace to '\t', '\n', '\r' */
            if (ch == '\t') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 't');
            }
            else if (ch == '\n') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'n');
            }
            else if (ch == '\r') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'r');
            }

            /* Map non-printable US ASCII to '\xhh' */
            else if (ch < ' ' || ch == 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'x');
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
            }

            /* Copy ASCII characters as-is */
            else if (ch < 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }

            /* Non-ASCII characters */
            else {
                /* Map Unicode whitespace and control characters
                   (categories Z* and C* except ASCII space)
                */
                if (!Py_UNICODE_ISPRINTABLE(ch)) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    /* Map 8-bit characters to '\xhh' */
                    if (ch <= 0xff) {
                        PyUnicode_WRITE(okind, odata, o++, 'x');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                    }
                    /* Map 16-bit characters to '\uxxxx' */
                    else if (ch <= 0xffff) {
                        PyUnicode_WRITE(okind, odata, o++, 'u');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                    /* Map 21-bit characters to '\U00xxxxxx' */
                    else {
                        PyUnicode_WRITE(okind, odata, o++, 'U');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                }
                /* Copy characters as-is */
                else {
                    PyUnicode_WRITE(okind, odata, o++, ch);
                }
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
13051 
13052 PyDoc_STRVAR(rfind__doc__,
13053              "S.rfind(sub[, start[, end]]) -> int\n\
13054 \n\
13055 Return the highest index in S where substring sub is found,\n\
13056 such that sub is contained within S[start:end].  Optional\n\
13057 arguments start and end are interpreted as in slice notation.\n\
13058 \n\
13059 Return -1 on failure.");
13060 
13061 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13062 unicode_rfind(PyObject *self, PyObject *args)
13063 {
13064     /* initialize variables to prevent gcc warning */
13065     PyObject *substring = NULL;
13066     Py_ssize_t start = 0;
13067     Py_ssize_t end = 0;
13068     Py_ssize_t result;
13069 
13070     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13071         return NULL;
13072 
13073     if (PyUnicode_READY(self) == -1)
13074         return NULL;
13075 
13076     result = any_find_slice(self, substring, start, end, -1);
13077 
13078     if (result == -2)
13079         return NULL;
13080 
13081     return PyLong_FromSsize_t(result);
13082 }
13083 
13084 PyDoc_STRVAR(rindex__doc__,
13085              "S.rindex(sub[, start[, end]]) -> int\n\
13086 \n\
13087 Return the highest index in S where substring sub is found,\n\
13088 such that sub is contained within S[start:end].  Optional\n\
13089 arguments start and end are interpreted as in slice notation.\n\
13090 \n\
13091 Raises ValueError when the substring is not found.");
13092 
13093 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13094 unicode_rindex(PyObject *self, PyObject *args)
13095 {
13096     /* initialize variables to prevent gcc warning */
13097     PyObject *substring = NULL;
13098     Py_ssize_t start = 0;
13099     Py_ssize_t end = 0;
13100     Py_ssize_t result;
13101 
13102     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13103         return NULL;
13104 
13105     if (PyUnicode_READY(self) == -1)
13106         return NULL;
13107 
13108     result = any_find_slice(self, substring, start, end, -1);
13109 
13110     if (result == -2)
13111         return NULL;
13112 
13113     if (result < 0) {
13114         PyErr_SetString(PyExc_ValueError, "substring not found");
13115         return NULL;
13116     }
13117 
13118     return PyLong_FromSsize_t(result);
13119 }
13120 
13121 /*[clinic input]
13122 str.rjust as unicode_rjust
13123 
13124     width: Py_ssize_t
13125     fillchar: Py_UCS4 = ' '
13126     /
13127 
13128 Return a right-justified string of length width.
13129 
13130 Padding is done using the specified fill character (default is a space).
13131 [clinic start generated code]*/
13132 
13133 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13134 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13135 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13136 {
13137     if (PyUnicode_READY(self) == -1)
13138         return NULL;
13139 
13140     if (PyUnicode_GET_LENGTH(self) >= width)
13141         return unicode_result_unchanged(self);
13142 
13143     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13144 }
13145 
13146 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13147 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13148 {
13149     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13150         return NULL;
13151 
13152     return split(s, sep, maxsplit);
13153 }
13154 
13155 /*[clinic input]
13156 str.split as unicode_split
13157 
13158     sep: object = None
13159         The separator used to split the string.
13160 
13161         When set to None (the default value), will split on any whitespace
13162         character (including \\n \\r \\t \\f and spaces) and will discard
13163         empty strings from the result.
13164     maxsplit: Py_ssize_t = -1
13165         Maximum number of splits (starting from the left).
13166         -1 (the default value) means no limit.
13167 
13168 Return a list of the substrings in the string, using sep as the separator string.
13169 
13170 Note, str.split() is mainly useful for data that has been intentionally
13171 delimited.  With natural text that includes punctuation, consider using
13172 the regular expression module.
13173 
13174 [clinic start generated code]*/
13175 
13176 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13177 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13178 /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
13179 {
13180     if (sep == Py_None)
13181         return split(self, NULL, maxsplit);
13182     if (PyUnicode_Check(sep))
13183         return split(self, sep, maxsplit);
13184 
13185     PyErr_Format(PyExc_TypeError,
13186                  "must be str or None, not %.100s",
13187                  Py_TYPE(sep)->tp_name);
13188     return NULL;
13189 }
13190 
13191 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13192 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13193 {
13194     PyObject* out;
13195     int kind1, kind2;
13196     const void *buf1, *buf2;
13197     Py_ssize_t len1, len2;
13198 
13199     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13200         return NULL;
13201 
13202     kind1 = PyUnicode_KIND(str_obj);
13203     kind2 = PyUnicode_KIND(sep_obj);
13204     len1 = PyUnicode_GET_LENGTH(str_obj);
13205     len2 = PyUnicode_GET_LENGTH(sep_obj);
13206     if (kind1 < kind2 || len1 < len2) {
13207         PyObject *empty = unicode_get_empty();  // Borrowed reference
13208         return PyTuple_Pack(3, str_obj, empty, empty);
13209     }
13210     buf1 = PyUnicode_DATA(str_obj);
13211     buf2 = PyUnicode_DATA(sep_obj);
13212     if (kind2 != kind1) {
13213         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13214         if (!buf2)
13215             return NULL;
13216     }
13217 
13218     switch (kind1) {
13219     case PyUnicode_1BYTE_KIND:
13220         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13221             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13222         else
13223             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13224         break;
13225     case PyUnicode_2BYTE_KIND:
13226         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13227         break;
13228     case PyUnicode_4BYTE_KIND:
13229         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13230         break;
13231     default:
13232         Py_UNREACHABLE();
13233     }
13234 
13235     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13236     if (kind2 != kind1)
13237         PyMem_Free((void *)buf2);
13238 
13239     return out;
13240 }
13241 
13242 
13243 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13244 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13245 {
13246     PyObject* out;
13247     int kind1, kind2;
13248     const void *buf1, *buf2;
13249     Py_ssize_t len1, len2;
13250 
13251     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13252         return NULL;
13253 
13254     kind1 = PyUnicode_KIND(str_obj);
13255     kind2 = PyUnicode_KIND(sep_obj);
13256     len1 = PyUnicode_GET_LENGTH(str_obj);
13257     len2 = PyUnicode_GET_LENGTH(sep_obj);
13258     if (kind1 < kind2 || len1 < len2) {
13259         PyObject *empty = unicode_get_empty();  // Borrowed reference
13260         return PyTuple_Pack(3, empty, empty, str_obj);
13261     }
13262     buf1 = PyUnicode_DATA(str_obj);
13263     buf2 = PyUnicode_DATA(sep_obj);
13264     if (kind2 != kind1) {
13265         buf2 = unicode_askind(kind2, buf2, len2, kind1);
13266         if (!buf2)
13267             return NULL;
13268     }
13269 
13270     switch (kind1) {
13271     case PyUnicode_1BYTE_KIND:
13272         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13273             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13274         else
13275             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13276         break;
13277     case PyUnicode_2BYTE_KIND:
13278         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13279         break;
13280     case PyUnicode_4BYTE_KIND:
13281         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13282         break;
13283     default:
13284         Py_UNREACHABLE();
13285     }
13286 
13287     assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13288     if (kind2 != kind1)
13289         PyMem_Free((void *)buf2);
13290 
13291     return out;
13292 }
13293 
13294 /*[clinic input]
13295 str.partition as unicode_partition
13296 
13297     sep: object
13298     /
13299 
13300 Partition the string into three parts using the given separator.
13301 
13302 This will search for the separator in the string.  If the separator is found,
13303 returns a 3-tuple containing the part before the separator, the separator
13304 itself, and the part after it.
13305 
13306 If the separator is not found, returns a 3-tuple containing the original string
13307 and two empty strings.
13308 [clinic start generated code]*/
13309 
13310 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13311 unicode_partition(PyObject *self, PyObject *sep)
13312 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13313 {
13314     return PyUnicode_Partition(self, sep);
13315 }
13316 
13317 /*[clinic input]
13318 str.rpartition as unicode_rpartition = str.partition
13319 
13320 Partition the string into three parts using the given separator.
13321 
13322 This will search for the separator in the string, starting at the end. If
13323 the separator is found, returns a 3-tuple containing the part before the
13324 separator, the separator itself, and the part after it.
13325 
13326 If the separator is not found, returns a 3-tuple containing two empty strings
13327 and the original string.
13328 [clinic start generated code]*/
13329 
13330 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13331 unicode_rpartition(PyObject *self, PyObject *sep)
13332 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13333 {
13334     return PyUnicode_RPartition(self, sep);
13335 }
13336 
13337 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13338 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13339 {
13340     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13341         return NULL;
13342 
13343     return rsplit(s, sep, maxsplit);
13344 }
13345 
13346 /*[clinic input]
13347 str.rsplit as unicode_rsplit = str.split
13348 
13349 Return a list of the substrings in the string, using sep as the separator string.
13350 
13351 Splitting starts at the end of the string and works to the front.
13352 [clinic start generated code]*/
13353 
13354 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13355 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13356 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13357 {
13358     if (sep == Py_None)
13359         return rsplit(self, NULL, maxsplit);
13360     if (PyUnicode_Check(sep))
13361         return rsplit(self, sep, maxsplit);
13362 
13363     PyErr_Format(PyExc_TypeError,
13364                  "must be str or None, not %.100s",
13365                  Py_TYPE(sep)->tp_name);
13366     return NULL;
13367 }
13368 
13369 /*[clinic input]
13370 str.splitlines as unicode_splitlines
13371 
13372     keepends: bool(accept={int}) = False
13373 
13374 Return a list of the lines in the string, breaking at line boundaries.
13375 
13376 Line breaks are not included in the resulting list unless keepends is given and
13377 true.
13378 [clinic start generated code]*/
13379 
13380 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13381 unicode_splitlines_impl(PyObject *self, int keepends)
13382 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13383 {
13384     return PyUnicode_Splitlines(self, keepends);
13385 }
13386 
13387 static
unicode_str(PyObject * self)13388 PyObject *unicode_str(PyObject *self)
13389 {
13390     return unicode_result_unchanged(self);
13391 }
13392 
13393 /*[clinic input]
13394 str.swapcase as unicode_swapcase
13395 
13396 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13397 [clinic start generated code]*/
13398 
13399 static PyObject *
unicode_swapcase_impl(PyObject * self)13400 unicode_swapcase_impl(PyObject *self)
13401 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13402 {
13403     if (PyUnicode_READY(self) == -1)
13404         return NULL;
13405     return case_operation(self, do_swapcase);
13406 }
13407 
13408 /*[clinic input]
13409 
13410 @staticmethod
13411 str.maketrans as unicode_maketrans
13412 
13413   x: object
13414 
13415   y: unicode=NULL
13416 
13417   z: unicode=NULL
13418 
13419   /
13420 
13421 Return a translation table usable for str.translate().
13422 
13423 If there is only one argument, it must be a dictionary mapping Unicode
13424 ordinals (integers) or characters to Unicode ordinals, strings or None.
13425 Character keys will be then converted to ordinals.
13426 If there are two arguments, they must be strings of equal length, and
13427 in the resulting dictionary, each character in x will be mapped to the
13428 character at the same position in y. If there is a third argument, it
13429 must be a string, whose characters will be mapped to None in the result.
13430 [clinic start generated code]*/
13431 
13432 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13433 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13434 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13435 {
13436     PyObject *new = NULL, *key, *value;
13437     Py_ssize_t i = 0;
13438     int res;
13439 
13440     new = PyDict_New();
13441     if (!new)
13442         return NULL;
13443     if (y != NULL) {
13444         int x_kind, y_kind, z_kind;
13445         const void *x_data, *y_data, *z_data;
13446 
13447         /* x must be a string too, of equal length */
13448         if (!PyUnicode_Check(x)) {
13449             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13450                             "be a string if there is a second argument");
13451             goto err;
13452         }
13453         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13454             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13455                             "arguments must have equal length");
13456             goto err;
13457         }
13458         /* create entries for translating chars in x to those in y */
13459         x_kind = PyUnicode_KIND(x);
13460         y_kind = PyUnicode_KIND(y);
13461         x_data = PyUnicode_DATA(x);
13462         y_data = PyUnicode_DATA(y);
13463         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13464             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13465             if (!key)
13466                 goto err;
13467             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13468             if (!value) {
13469                 Py_DECREF(key);
13470                 goto err;
13471             }
13472             res = PyDict_SetItem(new, key, value);
13473             Py_DECREF(key);
13474             Py_DECREF(value);
13475             if (res < 0)
13476                 goto err;
13477         }
13478         /* create entries for deleting chars in z */
13479         if (z != NULL) {
13480             z_kind = PyUnicode_KIND(z);
13481             z_data = PyUnicode_DATA(z);
13482             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13483                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13484                 if (!key)
13485                     goto err;
13486                 res = PyDict_SetItem(new, key, Py_None);
13487                 Py_DECREF(key);
13488                 if (res < 0)
13489                     goto err;
13490             }
13491         }
13492     } else {
13493         int kind;
13494         const void *data;
13495 
13496         /* x must be a dict */
13497         if (!PyDict_CheckExact(x)) {
13498             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13499                             "to maketrans it must be a dict");
13500             goto err;
13501         }
13502         /* copy entries into the new dict, converting string keys to int keys */
13503         while (PyDict_Next(x, &i, &key, &value)) {
13504             if (PyUnicode_Check(key)) {
13505                 /* convert string keys to integer keys */
13506                 PyObject *newkey;
13507                 if (PyUnicode_GET_LENGTH(key) != 1) {
13508                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13509                                     "table must be of length 1");
13510                     goto err;
13511                 }
13512                 kind = PyUnicode_KIND(key);
13513                 data = PyUnicode_DATA(key);
13514                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13515                 if (!newkey)
13516                     goto err;
13517                 res = PyDict_SetItem(new, newkey, value);
13518                 Py_DECREF(newkey);
13519                 if (res < 0)
13520                     goto err;
13521             } else if (PyLong_Check(key)) {
13522                 /* just keep integer keys */
13523                 if (PyDict_SetItem(new, key, value) < 0)
13524                     goto err;
13525             } else {
13526                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13527                                 "be strings or integers");
13528                 goto err;
13529             }
13530         }
13531     }
13532     return new;
13533   err:
13534     Py_DECREF(new);
13535     return NULL;
13536 }
13537 
13538 /*[clinic input]
13539 str.translate as unicode_translate
13540 
13541     table: object
13542         Translation table, which must be a mapping of Unicode ordinals to
13543         Unicode ordinals, strings, or None.
13544     /
13545 
13546 Replace each character in the string using the given translation table.
13547 
13548 The table must implement lookup/indexing via __getitem__, for instance a
13549 dictionary or list.  If this operation raises LookupError, the character is
13550 left untouched.  Characters mapped to None are deleted.
13551 [clinic start generated code]*/
13552 
13553 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13554 unicode_translate(PyObject *self, PyObject *table)
13555 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13556 {
13557     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13558 }
13559 
13560 /*[clinic input]
13561 str.upper as unicode_upper
13562 
13563 Return a copy of the string converted to uppercase.
13564 [clinic start generated code]*/
13565 
13566 static PyObject *
unicode_upper_impl(PyObject * self)13567 unicode_upper_impl(PyObject *self)
13568 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13569 {
13570     if (PyUnicode_READY(self) == -1)
13571         return NULL;
13572     if (PyUnicode_IS_ASCII(self))
13573         return ascii_upper_or_lower(self, 0);
13574     return case_operation(self, do_upper);
13575 }
13576 
13577 /*[clinic input]
13578 str.zfill as unicode_zfill
13579 
13580     width: Py_ssize_t
13581     /
13582 
13583 Pad a numeric string with zeros on the left, to fill a field of the given width.
13584 
13585 The string is never truncated.
13586 [clinic start generated code]*/
13587 
13588 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13589 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13590 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13591 {
13592     Py_ssize_t fill;
13593     PyObject *u;
13594     int kind;
13595     const void *data;
13596     Py_UCS4 chr;
13597 
13598     if (PyUnicode_READY(self) == -1)
13599         return NULL;
13600 
13601     if (PyUnicode_GET_LENGTH(self) >= width)
13602         return unicode_result_unchanged(self);
13603 
13604     fill = width - PyUnicode_GET_LENGTH(self);
13605 
13606     u = pad(self, fill, 0, '0');
13607 
13608     if (u == NULL)
13609         return NULL;
13610 
13611     kind = PyUnicode_KIND(u);
13612     data = PyUnicode_DATA(u);
13613     chr = PyUnicode_READ(kind, data, fill);
13614 
13615     if (chr == '+' || chr == '-') {
13616         /* move sign to beginning of string */
13617         PyUnicode_WRITE(kind, data, 0, chr);
13618         PyUnicode_WRITE(kind, data, fill, '0');
13619     }
13620 
13621     assert(_PyUnicode_CheckConsistency(u, 1));
13622     return u;
13623 }
13624 
13625 PyDoc_STRVAR(startswith__doc__,
13626              "S.startswith(prefix[, start[, end]]) -> bool\n\
13627 \n\
13628 Return True if S starts with the specified prefix, False otherwise.\n\
13629 With optional start, test S beginning at that position.\n\
13630 With optional end, stop comparing S at that position.\n\
13631 prefix can also be a tuple of strings to try.");
13632 
13633 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13634 unicode_startswith(PyObject *self,
13635                    PyObject *args)
13636 {
13637     PyObject *subobj;
13638     PyObject *substring;
13639     Py_ssize_t start = 0;
13640     Py_ssize_t end = PY_SSIZE_T_MAX;
13641     int result;
13642 
13643     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13644         return NULL;
13645     if (PyTuple_Check(subobj)) {
13646         Py_ssize_t i;
13647         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13648             substring = PyTuple_GET_ITEM(subobj, i);
13649             if (!PyUnicode_Check(substring)) {
13650                 PyErr_Format(PyExc_TypeError,
13651                              "tuple for startswith must only contain str, "
13652                              "not %.100s",
13653                              Py_TYPE(substring)->tp_name);
13654                 return NULL;
13655             }
13656             result = tailmatch(self, substring, start, end, -1);
13657             if (result == -1)
13658                 return NULL;
13659             if (result) {
13660                 Py_RETURN_TRUE;
13661             }
13662         }
13663         /* nothing matched */
13664         Py_RETURN_FALSE;
13665     }
13666     if (!PyUnicode_Check(subobj)) {
13667         PyErr_Format(PyExc_TypeError,
13668                      "startswith first arg must be str or "
13669                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13670         return NULL;
13671     }
13672     result = tailmatch(self, subobj, start, end, -1);
13673     if (result == -1)
13674         return NULL;
13675     return PyBool_FromLong(result);
13676 }
13677 
13678 
13679 PyDoc_STRVAR(endswith__doc__,
13680              "S.endswith(suffix[, start[, end]]) -> bool\n\
13681 \n\
13682 Return True if S ends with the specified suffix, False otherwise.\n\
13683 With optional start, test S beginning at that position.\n\
13684 With optional end, stop comparing S at that position.\n\
13685 suffix can also be a tuple of strings to try.");
13686 
13687 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13688 unicode_endswith(PyObject *self,
13689                  PyObject *args)
13690 {
13691     PyObject *subobj;
13692     PyObject *substring;
13693     Py_ssize_t start = 0;
13694     Py_ssize_t end = PY_SSIZE_T_MAX;
13695     int result;
13696 
13697     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13698         return NULL;
13699     if (PyTuple_Check(subobj)) {
13700         Py_ssize_t i;
13701         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13702             substring = PyTuple_GET_ITEM(subobj, i);
13703             if (!PyUnicode_Check(substring)) {
13704                 PyErr_Format(PyExc_TypeError,
13705                              "tuple for endswith must only contain str, "
13706                              "not %.100s",
13707                              Py_TYPE(substring)->tp_name);
13708                 return NULL;
13709             }
13710             result = tailmatch(self, substring, start, end, +1);
13711             if (result == -1)
13712                 return NULL;
13713             if (result) {
13714                 Py_RETURN_TRUE;
13715             }
13716         }
13717         Py_RETURN_FALSE;
13718     }
13719     if (!PyUnicode_Check(subobj)) {
13720         PyErr_Format(PyExc_TypeError,
13721                      "endswith first arg must be str or "
13722                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13723         return NULL;
13724     }
13725     result = tailmatch(self, subobj, start, end, +1);
13726     if (result == -1)
13727         return NULL;
13728     return PyBool_FromLong(result);
13729 }
13730 
13731 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13732 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13733 {
13734     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13735     writer->data = PyUnicode_DATA(writer->buffer);
13736 
13737     if (!writer->readonly) {
13738         writer->kind = PyUnicode_KIND(writer->buffer);
13739         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13740     }
13741     else {
13742         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13743            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13744         writer->kind = PyUnicode_WCHAR_KIND;
13745         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13746 
13747         /* Copy-on-write mode: set buffer size to 0 so
13748          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13749          * next write. */
13750         writer->size = 0;
13751     }
13752 }
13753 
13754 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13755 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13756 {
13757     memset(writer, 0, sizeof(*writer));
13758 
13759     /* ASCII is the bare minimum */
13760     writer->min_char = 127;
13761 
13762     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13763        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13764     writer->kind = PyUnicode_WCHAR_KIND;
13765     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13766 }
13767 
13768 // Initialize _PyUnicodeWriter with initial buffer
13769 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13770 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13771 {
13772     memset(writer, 0, sizeof(*writer));
13773     writer->buffer = buffer;
13774     _PyUnicodeWriter_Update(writer);
13775     writer->min_length = writer->size;
13776 }
13777 
13778 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13779 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13780                                  Py_ssize_t length, Py_UCS4 maxchar)
13781 {
13782     Py_ssize_t newlen;
13783     PyObject *newbuffer;
13784 
13785     assert(maxchar <= MAX_UNICODE);
13786 
13787     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13788     assert((maxchar > writer->maxchar && length >= 0)
13789            || length > 0);
13790 
13791     if (length > PY_SSIZE_T_MAX - writer->pos) {
13792         PyErr_NoMemory();
13793         return -1;
13794     }
13795     newlen = writer->pos + length;
13796 
13797     maxchar = Py_MAX(maxchar, writer->min_char);
13798 
13799     if (writer->buffer == NULL) {
13800         assert(!writer->readonly);
13801         if (writer->overallocate
13802             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13803             /* overallocate to limit the number of realloc() */
13804             newlen += newlen / OVERALLOCATE_FACTOR;
13805         }
13806         if (newlen < writer->min_length)
13807             newlen = writer->min_length;
13808 
13809         writer->buffer = PyUnicode_New(newlen, maxchar);
13810         if (writer->buffer == NULL)
13811             return -1;
13812     }
13813     else if (newlen > writer->size) {
13814         if (writer->overallocate
13815             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13816             /* overallocate to limit the number of realloc() */
13817             newlen += newlen / OVERALLOCATE_FACTOR;
13818         }
13819         if (newlen < writer->min_length)
13820             newlen = writer->min_length;
13821 
13822         if (maxchar > writer->maxchar || writer->readonly) {
13823             /* resize + widen */
13824             maxchar = Py_MAX(maxchar, writer->maxchar);
13825             newbuffer = PyUnicode_New(newlen, maxchar);
13826             if (newbuffer == NULL)
13827                 return -1;
13828             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13829                                           writer->buffer, 0, writer->pos);
13830             Py_DECREF(writer->buffer);
13831             writer->readonly = 0;
13832         }
13833         else {
13834             newbuffer = resize_compact(writer->buffer, newlen);
13835             if (newbuffer == NULL)
13836                 return -1;
13837         }
13838         writer->buffer = newbuffer;
13839     }
13840     else if (maxchar > writer->maxchar) {
13841         assert(!writer->readonly);
13842         newbuffer = PyUnicode_New(writer->size, maxchar);
13843         if (newbuffer == NULL)
13844             return -1;
13845         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13846                                       writer->buffer, 0, writer->pos);
13847         Py_SETREF(writer->buffer, newbuffer);
13848     }
13849     _PyUnicodeWriter_Update(writer);
13850     return 0;
13851 
13852 #undef OVERALLOCATE_FACTOR
13853 }
13854 
13855 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13856 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13857                                      enum PyUnicode_Kind kind)
13858 {
13859     Py_UCS4 maxchar;
13860 
13861     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13862     assert(writer->kind < kind);
13863 
13864     switch (kind)
13865     {
13866     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13867     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13868     case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13869     default:
13870         Py_UNREACHABLE();
13871     }
13872 
13873     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13874 }
13875 
13876 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13877 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13878 {
13879     assert(ch <= MAX_UNICODE);
13880     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13881         return -1;
13882     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13883     writer->pos++;
13884     return 0;
13885 }
13886 
13887 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13888 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13889 {
13890     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13891 }
13892 
13893 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13894 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13895 {
13896     Py_UCS4 maxchar;
13897     Py_ssize_t len;
13898 
13899     if (PyUnicode_READY(str) == -1)
13900         return -1;
13901     len = PyUnicode_GET_LENGTH(str);
13902     if (len == 0)
13903         return 0;
13904     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13905     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13906         if (writer->buffer == NULL && !writer->overallocate) {
13907             assert(_PyUnicode_CheckConsistency(str, 1));
13908             writer->readonly = 1;
13909             Py_INCREF(str);
13910             writer->buffer = str;
13911             _PyUnicodeWriter_Update(writer);
13912             writer->pos += len;
13913             return 0;
13914         }
13915         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13916             return -1;
13917     }
13918     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13919                                   str, 0, len);
13920     writer->pos += len;
13921     return 0;
13922 }
13923 
13924 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13925 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13926                                 Py_ssize_t start, Py_ssize_t end)
13927 {
13928     Py_UCS4 maxchar;
13929     Py_ssize_t len;
13930 
13931     if (PyUnicode_READY(str) == -1)
13932         return -1;
13933 
13934     assert(0 <= start);
13935     assert(end <= PyUnicode_GET_LENGTH(str));
13936     assert(start <= end);
13937 
13938     if (end == 0)
13939         return 0;
13940 
13941     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13942         return _PyUnicodeWriter_WriteStr(writer, str);
13943 
13944     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13945         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13946     else
13947         maxchar = writer->maxchar;
13948     len = end - start;
13949 
13950     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13951         return -1;
13952 
13953     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13954                                   str, start, len);
13955     writer->pos += len;
13956     return 0;
13957 }
13958 
13959 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13960 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13961                                   const char *ascii, Py_ssize_t len)
13962 {
13963     if (len == -1)
13964         len = strlen(ascii);
13965 
13966     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13967 
13968     if (writer->buffer == NULL && !writer->overallocate) {
13969         PyObject *str;
13970 
13971         str = _PyUnicode_FromASCII(ascii, len);
13972         if (str == NULL)
13973             return -1;
13974 
13975         writer->readonly = 1;
13976         writer->buffer = str;
13977         _PyUnicodeWriter_Update(writer);
13978         writer->pos += len;
13979         return 0;
13980     }
13981 
13982     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13983         return -1;
13984 
13985     switch (writer->kind)
13986     {
13987     case PyUnicode_1BYTE_KIND:
13988     {
13989         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13990         Py_UCS1 *data = writer->data;
13991 
13992         memcpy(data + writer->pos, str, len);
13993         break;
13994     }
13995     case PyUnicode_2BYTE_KIND:
13996     {
13997         _PyUnicode_CONVERT_BYTES(
13998             Py_UCS1, Py_UCS2,
13999             ascii, ascii + len,
14000             (Py_UCS2 *)writer->data + writer->pos);
14001         break;
14002     }
14003     case PyUnicode_4BYTE_KIND:
14004     {
14005         _PyUnicode_CONVERT_BYTES(
14006             Py_UCS1, Py_UCS4,
14007             ascii, ascii + len,
14008             (Py_UCS4 *)writer->data + writer->pos);
14009         break;
14010     }
14011     default:
14012         Py_UNREACHABLE();
14013     }
14014 
14015     writer->pos += len;
14016     return 0;
14017 }
14018 
14019 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14020 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14021                                    const char *str, Py_ssize_t len)
14022 {
14023     Py_UCS4 maxchar;
14024 
14025     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14026     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14027         return -1;
14028     unicode_write_cstr(writer->buffer, writer->pos, str, len);
14029     writer->pos += len;
14030     return 0;
14031 }
14032 
14033 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14034 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14035 {
14036     PyObject *str;
14037 
14038     if (writer->pos == 0) {
14039         Py_CLEAR(writer->buffer);
14040         _Py_RETURN_UNICODE_EMPTY();
14041     }
14042 
14043     str = writer->buffer;
14044     writer->buffer = NULL;
14045 
14046     if (writer->readonly) {
14047         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14048         return str;
14049     }
14050 
14051     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14052         PyObject *str2;
14053         str2 = resize_compact(str, writer->pos);
14054         if (str2 == NULL) {
14055             Py_DECREF(str);
14056             return NULL;
14057         }
14058         str = str2;
14059     }
14060 
14061     assert(_PyUnicode_CheckConsistency(str, 1));
14062     return unicode_result_ready(str);
14063 }
14064 
14065 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)14066 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14067 {
14068     Py_CLEAR(writer->buffer);
14069 }
14070 
14071 #include "stringlib/unicode_format.h"
14072 
/* Docstring for str.format(); referenced by the "format" entry of
   unicode_methods[]. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring for str.format_map(); referenced by the "format_map" entry of
   unicode_methods[]. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
14084 
14085 /*[clinic input]
14086 str.__format__ as unicode___format__
14087 
14088     format_spec: unicode
14089     /
14090 
14091 Return a formatted version of the string as described by format_spec.
14092 [clinic start generated code]*/
14093 
14094 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14095 unicode___format___impl(PyObject *self, PyObject *format_spec)
14096 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14097 {
14098     _PyUnicodeWriter writer;
14099     int ret;
14100 
14101     if (PyUnicode_READY(self) == -1)
14102         return NULL;
14103     _PyUnicodeWriter_Init(&writer);
14104     ret = _PyUnicode_FormatAdvancedWriter(&writer,
14105                                           self, format_spec, 0,
14106                                           PyUnicode_GET_LENGTH(format_spec));
14107     if (ret == -1) {
14108         _PyUnicodeWriter_Dealloc(&writer);
14109         return NULL;
14110     }
14111     return _PyUnicodeWriter_Finish(&writer);
14112 }
14113 
14114 /*[clinic input]
14115 str.__sizeof__ as unicode_sizeof
14116 
14117 Return the size of the string in memory, in bytes.
14118 [clinic start generated code]*/
14119 
14120 static PyObject *
unicode_sizeof_impl(PyObject * self)14121 unicode_sizeof_impl(PyObject *self)
14122 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14123 {
14124     Py_ssize_t size;
14125 
14126     /* If it's a compact object, account for base structure +
14127        character data. */
14128     if (PyUnicode_IS_COMPACT_ASCII(self))
14129         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14130     else if (PyUnicode_IS_COMPACT(self))
14131         size = sizeof(PyCompactUnicodeObject) +
14132             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14133     else {
14134         /* If it is a two-block object, account for base object, and
14135            for character block if present. */
14136         size = sizeof(PyUnicodeObject);
14137         if (_PyUnicode_DATA_ANY(self))
14138             size += (PyUnicode_GET_LENGTH(self) + 1) *
14139                 PyUnicode_KIND(self);
14140     }
14141     /* If the wstr pointer is present, account for it unless it is shared
14142        with the data pointer. Check if the data is not shared. */
14143     if (_PyUnicode_HAS_WSTR_MEMORY(self))
14144         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14145     if (_PyUnicode_HAS_UTF8_MEMORY(self))
14146         size += PyUnicode_UTF8_LENGTH(self) + 1;
14147 
14148     return PyLong_FromSsize_t(size);
14149 }
14150 
14151 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14152 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14153 {
14154     PyObject *copy = _PyUnicode_Copy(v);
14155     if (!copy)
14156         return NULL;
14157     return Py_BuildValue("(N)", copy);
14158 }
14159 
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined elsewhere
   (presumably generated by Argument Clinic -- TODO confirm); the remaining
   entries are written out by hand as {name, function, flags, docstring}.
   The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() accepts both positional and keyword arguments; format_map()
       takes exactly one object (METH_O). */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14213 
14214 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14215 unicode_mod(PyObject *v, PyObject *w)
14216 {
14217     if (!PyUnicode_Check(v))
14218         Py_RETURN_NOTIMPLEMENTED;
14219     return PyUnicode_Format(v, w);
14220 }
14221 
14222 static PyNumberMethods unicode_as_number = {
14223     0,              /*nb_add*/
14224     0,              /*nb_subtract*/
14225     0,              /*nb_multiply*/
14226     unicode_mod,            /*nb_remainder*/
14227 };
14228 
14229 static PySequenceMethods unicode_as_sequence = {
14230     (lenfunc) unicode_length,       /* sq_length */
14231     PyUnicode_Concat,           /* sq_concat */
14232     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
14233     (ssizeargfunc) unicode_getitem,     /* sq_item */
14234     0,                  /* sq_slice */
14235     0,                  /* sq_ass_item */
14236     0,                  /* sq_ass_slice */
14237     PyUnicode_Contains,         /* sq_contains */
14238 };
14239 
14240 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14241 unicode_subscript(PyObject* self, PyObject* item)
14242 {
14243     if (PyUnicode_READY(self) == -1)
14244         return NULL;
14245 
14246     if (_PyIndex_Check(item)) {
14247         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14248         if (i == -1 && PyErr_Occurred())
14249             return NULL;
14250         if (i < 0)
14251             i += PyUnicode_GET_LENGTH(self);
14252         return unicode_getitem(self, i);
14253     } else if (PySlice_Check(item)) {
14254         Py_ssize_t start, stop, step, slicelength, i;
14255         size_t cur;
14256         PyObject *result;
14257         const void *src_data;
14258         void *dest_data;
14259         int src_kind, dest_kind;
14260         Py_UCS4 ch, max_char, kind_limit;
14261 
14262         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14263             return NULL;
14264         }
14265         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14266                                             &start, &stop, step);
14267 
14268         if (slicelength <= 0) {
14269             _Py_RETURN_UNICODE_EMPTY();
14270         } else if (start == 0 && step == 1 &&
14271                    slicelength == PyUnicode_GET_LENGTH(self)) {
14272             return unicode_result_unchanged(self);
14273         } else if (step == 1) {
14274             return PyUnicode_Substring(self,
14275                                        start, start + slicelength);
14276         }
14277         /* General case */
14278         src_kind = PyUnicode_KIND(self);
14279         src_data = PyUnicode_DATA(self);
14280         if (!PyUnicode_IS_ASCII(self)) {
14281             kind_limit = kind_maxchar_limit(src_kind);
14282             max_char = 0;
14283             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14284                 ch = PyUnicode_READ(src_kind, src_data, cur);
14285                 if (ch > max_char) {
14286                     max_char = ch;
14287                     if (max_char >= kind_limit)
14288                         break;
14289                 }
14290             }
14291         }
14292         else
14293             max_char = 127;
14294         result = PyUnicode_New(slicelength, max_char);
14295         if (result == NULL)
14296             return NULL;
14297         dest_kind = PyUnicode_KIND(result);
14298         dest_data = PyUnicode_DATA(result);
14299 
14300         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14301             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14302             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14303         }
14304         assert(_PyUnicode_CheckConsistency(result, 1));
14305         return result;
14306     } else {
14307         PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
14308                      Py_TYPE(item)->tp_name);
14309         return NULL;
14310     }
14311 }
14312 
14313 static PyMappingMethods unicode_as_mapping = {
14314     (lenfunc)unicode_length,        /* mp_length */
14315     (binaryfunc)unicode_subscript,  /* mp_subscript */
14316     (objobjargproc)0,           /* mp_ass_subscript */
14317 };
14318 
14319 
14320 /* Helpers for PyUnicode_Format() */
14321 
/* State shared by the helpers while one format string is processed. */
struct unicode_formatter_t {
    PyObject *args;             /* argument source: a tuple indexed by argidx,
                                   or (when arglen < 0) a single object that
                                   is returned as is */
    int args_owned;             /* non-zero when args was obtained through a
                                   "%(key)" lookup and must be DECREF'ed
                                   before being replaced */
    Py_ssize_t arglen, argidx;  /* arguments are available while
                                   argidx < arglen; argidx is the next one */
    PyObject *dict;             /* mapping used for "%(key)" lookups, or NULL
                                   when no mapping is available */

    enum PyUnicode_Kind fmtkind;    /* kind used to read from fmtdata */
    Py_ssize_t fmtcnt, fmtpos;      /* fmtpos: index of the next character to
                                       read; fmtcnt: countdown that goes
                                       negative when the format string is
                                       exhausted */
    const void *fmtdata;            /* character data of the format string */
    PyObject *fmtstr;               /* the format string object (used to
                                       slice out "%(key)" names) */

    _PyUnicodeWriter writer;        /* receives the formatted output */
};
14335 
/* Parsed description of a single "%..." conversion. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* character being examined while parsing; the
                           conversion character once parsing succeeded */
    int flags;          /* bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT,
                           F_ZERO */
    Py_ssize_t width;   /* field width; -1 appears to mean "not given"
                           (the fast paths test for it) -- TODO confirm the
                           initial value with the caller */
    int prec;           /* precision; negative is treated as "not given"
                           (formatfloat() then uses 6) */
    int sign;           /* set to 1 by the numeric conversions; presumably
                           enables sign handling in the caller -- TODO
                           confirm */
};
14343 
14344 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14345 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14346 {
14347     Py_ssize_t argidx = ctx->argidx;
14348 
14349     if (argidx < ctx->arglen) {
14350         ctx->argidx++;
14351         if (ctx->arglen < 0)
14352             return ctx->args;
14353         else
14354             return PyTuple_GetItem(ctx->args, argidx);
14355     }
14356     PyErr_SetString(PyExc_TypeError,
14357                     "not enough arguments for format string");
14358     return NULL;
14359 }
14360 
14361 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14362 
14363 /* Format a float into the writer if the writer is not NULL, or into *p_output
14364    otherwise.
14365 
14366    Return 0 on success, raise an exception and return -1 on error. */
14367 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14368 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14369             PyObject **p_output,
14370             _PyUnicodeWriter *writer)
14371 {
14372     char *p;
14373     double x;
14374     Py_ssize_t len;
14375     int prec;
14376     int dtoa_flags = 0;
14377 
14378     x = PyFloat_AsDouble(v);
14379     if (x == -1.0 && PyErr_Occurred())
14380         return -1;
14381 
14382     prec = arg->prec;
14383     if (prec < 0)
14384         prec = 6;
14385 
14386     if (arg->flags & F_ALT)
14387         dtoa_flags |= Py_DTSF_ALT;
14388     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14389     if (p == NULL)
14390         return -1;
14391     len = strlen(p);
14392     if (writer) {
14393         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14394             PyMem_Free(p);
14395             return -1;
14396         }
14397     }
14398     else
14399         *p_output = _PyUnicode_FromASCII(p, len);
14400     PyMem_Free(p);
14401     return 0;
14402 }
14403 
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the "alternate form" (#) flag, for Python ints.
 * Return value:  a new string object, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base marker is present only for o, x and X conversions, and only
 *         when alt is non-zero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker ("alternate form")
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 * Python ints.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* prec plus up to 3 non-digit characters (sign and base marker) must
       still fit in an int: see "numnondigits + prec" below. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; when there is a sign, rewrite it just in
           front of the digits (over the second marker character). */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* The padded text is assembled in a temporary bytes object, which
           then takes the place of result. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker, then the zero padding, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* result is either the temporary bytes object, or a string whose text
       may no longer start at its data pointer: in both cases build a new
       string from buf.  Otherwise only the length may need adjusting. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14549 
14550 /* Format an integer or a float as an integer.
14551  * Return 1 if the number has been formatted into the writer,
14552  *        0 if the number has been formatted into *p_output
14553  *       -1 and raise an exception on error */
14554 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14555 mainformatlong(PyObject *v,
14556                struct unicode_format_arg_t *arg,
14557                PyObject **p_output,
14558                _PyUnicodeWriter *writer)
14559 {
14560     PyObject *iobj, *res;
14561     char type = (char)arg->ch;
14562 
14563     if (!PyNumber_Check(v))
14564         goto wrongtype;
14565 
14566     /* make sure number is a type of integer for o, x, and X */
14567     if (!PyLong_Check(v)) {
14568         if (type == 'o' || type == 'x' || type == 'X') {
14569             iobj = _PyNumber_Index(v);
14570         }
14571         else {
14572             iobj = PyNumber_Long(v);
14573         }
14574         if (iobj == NULL ) {
14575             if (PyErr_ExceptionMatches(PyExc_TypeError))
14576                 goto wrongtype;
14577             return -1;
14578         }
14579         assert(PyLong_Check(iobj));
14580     }
14581     else {
14582         iobj = v;
14583         Py_INCREF(iobj);
14584     }
14585 
14586     if (PyLong_CheckExact(v)
14587         && arg->width == -1 && arg->prec == -1
14588         && !(arg->flags & (F_SIGN | F_BLANK))
14589         && type != 'X')
14590     {
14591         /* Fast path */
14592         int alternate = arg->flags & F_ALT;
14593         int base;
14594 
14595         switch(type)
14596         {
14597             default:
14598                 Py_UNREACHABLE();
14599             case 'd':
14600             case 'i':
14601             case 'u':
14602                 base = 10;
14603                 break;
14604             case 'o':
14605                 base = 8;
14606                 break;
14607             case 'x':
14608             case 'X':
14609                 base = 16;
14610                 break;
14611         }
14612 
14613         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14614             Py_DECREF(iobj);
14615             return -1;
14616         }
14617         Py_DECREF(iobj);
14618         return 1;
14619     }
14620 
14621     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14622     Py_DECREF(iobj);
14623     if (res == NULL)
14624         return -1;
14625     *p_output = res;
14626     return 0;
14627 
14628 wrongtype:
14629     switch(type)
14630     {
14631         case 'o':
14632         case 'x':
14633         case 'X':
14634             PyErr_Format(PyExc_TypeError,
14635                     "%%%c format: an integer is required, "
14636                     "not %.200s",
14637                     type, Py_TYPE(v)->tp_name);
14638             break;
14639         default:
14640             PyErr_Format(PyExc_TypeError,
14641                     "%%%c format: a real number is required, "
14642                     "not %.200s",
14643                     type, Py_TYPE(v)->tp_name);
14644             break;
14645     }
14646     return -1;
14647 }
14648 
14649 static Py_UCS4
formatchar(PyObject * v)14650 formatchar(PyObject *v)
14651 {
14652     /* presume that the buffer is at least 3 characters long */
14653     if (PyUnicode_Check(v)) {
14654         if (PyUnicode_GET_LENGTH(v) == 1) {
14655             return PyUnicode_READ_CHAR(v, 0);
14656         }
14657         goto onError;
14658     }
14659     else {
14660         int overflow;
14661         long x = PyLong_AsLongAndOverflow(v, &overflow);
14662         if (x == -1 && PyErr_Occurred()) {
14663             if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14664                 goto onError;
14665             }
14666             return (Py_UCS4) -1;
14667         }
14668 
14669         if (x < 0 || x > MAX_UNICODE) {
14670             /* this includes an overflow in converting to C long */
14671             PyErr_SetString(PyExc_OverflowError,
14672                             "%c arg not in range(0x110000)");
14673             return (Py_UCS4) -1;
14674         }
14675 
14676         return (Py_UCS4) x;
14677     }
14678 
14679   onError:
14680     PyErr_SetString(PyExc_TypeError,
14681                     "%c requires int or char");
14682     return (Py_UCS4) -1;
14683 }
14684 
14685 /* Parse options of an argument: flags, width, precision.
14686    Handle also "%(name)" syntax.
14687 
14688    Return 0 if the argument has been formatted into arg->str.
14689    Return 1 if the argument has been written into ctx->writer,
14690    Raise an exception and return -1 on error. */
14691 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14692 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14693                          struct unicode_format_arg_t *arg)
14694 {
14695 #define FORMAT_READ(ctx) \
14696         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14697 
14698     PyObject *v;
14699 
14700     if (arg->ch == '(') {
14701         /* Get argument value from a dictionary. Example: "%(name)s". */
14702         Py_ssize_t keystart;
14703         Py_ssize_t keylen;
14704         PyObject *key;
14705         int pcount = 1;
14706 
14707         if (ctx->dict == NULL) {
14708             PyErr_SetString(PyExc_TypeError,
14709                             "format requires a mapping");
14710             return -1;
14711         }
14712         ++ctx->fmtpos;
14713         --ctx->fmtcnt;
14714         keystart = ctx->fmtpos;
14715         /* Skip over balanced parentheses */
14716         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14717             arg->ch = FORMAT_READ(ctx);
14718             if (arg->ch == ')')
14719                 --pcount;
14720             else if (arg->ch == '(')
14721                 ++pcount;
14722             ctx->fmtpos++;
14723         }
14724         keylen = ctx->fmtpos - keystart - 1;
14725         if (ctx->fmtcnt < 0 || pcount > 0) {
14726             PyErr_SetString(PyExc_ValueError,
14727                             "incomplete format key");
14728             return -1;
14729         }
14730         key = PyUnicode_Substring(ctx->fmtstr,
14731                                   keystart, keystart + keylen);
14732         if (key == NULL)
14733             return -1;
14734         if (ctx->args_owned) {
14735             ctx->args_owned = 0;
14736             Py_DECREF(ctx->args);
14737         }
14738         ctx->args = PyObject_GetItem(ctx->dict, key);
14739         Py_DECREF(key);
14740         if (ctx->args == NULL)
14741             return -1;
14742         ctx->args_owned = 1;
14743         ctx->arglen = -1;
14744         ctx->argidx = -2;
14745     }
14746 
14747     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14748     while (--ctx->fmtcnt >= 0) {
14749         arg->ch = FORMAT_READ(ctx);
14750         ctx->fmtpos++;
14751         switch (arg->ch) {
14752         case '-': arg->flags |= F_LJUST; continue;
14753         case '+': arg->flags |= F_SIGN; continue;
14754         case ' ': arg->flags |= F_BLANK; continue;
14755         case '#': arg->flags |= F_ALT; continue;
14756         case '0': arg->flags |= F_ZERO; continue;
14757         }
14758         break;
14759     }
14760 
14761     /* Parse width. Example: "%10s" => width=10 */
14762     if (arg->ch == '*') {
14763         v = unicode_format_getnextarg(ctx);
14764         if (v == NULL)
14765             return -1;
14766         if (!PyLong_Check(v)) {
14767             PyErr_SetString(PyExc_TypeError,
14768                             "* wants int");
14769             return -1;
14770         }
14771         arg->width = PyLong_AsSsize_t(v);
14772         if (arg->width == -1 && PyErr_Occurred())
14773             return -1;
14774         if (arg->width < 0) {
14775             arg->flags |= F_LJUST;
14776             arg->width = -arg->width;
14777         }
14778         if (--ctx->fmtcnt >= 0) {
14779             arg->ch = FORMAT_READ(ctx);
14780             ctx->fmtpos++;
14781         }
14782     }
14783     else if (arg->ch >= '0' && arg->ch <= '9') {
14784         arg->width = arg->ch - '0';
14785         while (--ctx->fmtcnt >= 0) {
14786             arg->ch = FORMAT_READ(ctx);
14787             ctx->fmtpos++;
14788             if (arg->ch < '0' || arg->ch > '9')
14789                 break;
14790             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14791                mixing signed and unsigned comparison. Since arg->ch is between
14792                '0' and '9', casting to int is safe. */
14793             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14794                 PyErr_SetString(PyExc_ValueError,
14795                                 "width too big");
14796                 return -1;
14797             }
14798             arg->width = arg->width*10 + (arg->ch - '0');
14799         }
14800     }
14801 
14802     /* Parse precision. Example: "%.3f" => prec=3 */
14803     if (arg->ch == '.') {
14804         arg->prec = 0;
14805         if (--ctx->fmtcnt >= 0) {
14806             arg->ch = FORMAT_READ(ctx);
14807             ctx->fmtpos++;
14808         }
14809         if (arg->ch == '*') {
14810             v = unicode_format_getnextarg(ctx);
14811             if (v == NULL)
14812                 return -1;
14813             if (!PyLong_Check(v)) {
14814                 PyErr_SetString(PyExc_TypeError,
14815                                 "* wants int");
14816                 return -1;
14817             }
14818             arg->prec = _PyLong_AsInt(v);
14819             if (arg->prec == -1 && PyErr_Occurred())
14820                 return -1;
14821             if (arg->prec < 0)
14822                 arg->prec = 0;
14823             if (--ctx->fmtcnt >= 0) {
14824                 arg->ch = FORMAT_READ(ctx);
14825                 ctx->fmtpos++;
14826             }
14827         }
14828         else if (arg->ch >= '0' && arg->ch <= '9') {
14829             arg->prec = arg->ch - '0';
14830             while (--ctx->fmtcnt >= 0) {
14831                 arg->ch = FORMAT_READ(ctx);
14832                 ctx->fmtpos++;
14833                 if (arg->ch < '0' || arg->ch > '9')
14834                     break;
14835                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14836                     PyErr_SetString(PyExc_ValueError,
14837                                     "precision too big");
14838                     return -1;
14839                 }
14840                 arg->prec = arg->prec*10 + (arg->ch - '0');
14841             }
14842         }
14843     }
14844 
14845     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14846     if (ctx->fmtcnt >= 0) {
14847         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14848             if (--ctx->fmtcnt >= 0) {
14849                 arg->ch = FORMAT_READ(ctx);
14850                 ctx->fmtpos++;
14851             }
14852         }
14853     }
14854     if (ctx->fmtcnt < 0) {
14855         PyErr_SetString(PyExc_ValueError,
14856                         "incomplete format");
14857         return -1;
14858     }
14859     return 0;
14860 
14861 #undef FORMAT_READ
14862 }
14863 
14864 /* Format one argument. Supported conversion specifiers:
14865 
14866    - "s", "r", "a": any type
14867    - "i", "d", "u": int or float
14868    - "o", "x", "X": int
14869    - "e", "E", "f", "F", "g", "G": float
14870    - "c": int or str (1 character)
14871 
14872    When possible, the output is written directly into the Unicode writer
14873    (ctx->writer). A string is created when padding is required.
14874 
14875    Return 0 if the argument has been formatted into *p_str,
14876           1 if the argument has been written into ctx->writer,
14877          -1 on error. */
14878 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14879 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14880                           struct unicode_format_arg_t *arg,
14881                           PyObject **p_str)
14882 {
14883     PyObject *v;
14884     _PyUnicodeWriter *writer = &ctx->writer;
14885 
14886     if (ctx->fmtcnt == 0)
14887         ctx->writer.overallocate = 0;
14888 
14889     v = unicode_format_getnextarg(ctx);
14890     if (v == NULL)
14891         return -1;
14892 
14893 
14894     switch (arg->ch) {
14895     case 's':
14896     case 'r':
14897     case 'a':
14898         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14899             /* Fast path */
14900             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14901                 return -1;
14902             return 1;
14903         }
14904 
14905         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14906             *p_str = v;
14907             Py_INCREF(*p_str);
14908         }
14909         else {
14910             if (arg->ch == 's')
14911                 *p_str = PyObject_Str(v);
14912             else if (arg->ch == 'r')
14913                 *p_str = PyObject_Repr(v);
14914             else
14915                 *p_str = PyObject_ASCII(v);
14916         }
14917         break;
14918 
14919     case 'i':
14920     case 'd':
14921     case 'u':
14922     case 'o':
14923     case 'x':
14924     case 'X':
14925     {
14926         int ret = mainformatlong(v, arg, p_str, writer);
14927         if (ret != 0)
14928             return ret;
14929         arg->sign = 1;
14930         break;
14931     }
14932 
14933     case 'e':
14934     case 'E':
14935     case 'f':
14936     case 'F':
14937     case 'g':
14938     case 'G':
14939         if (arg->width == -1 && arg->prec == -1
14940             && !(arg->flags & (F_SIGN | F_BLANK)))
14941         {
14942             /* Fast path */
14943             if (formatfloat(v, arg, NULL, writer) == -1)
14944                 return -1;
14945             return 1;
14946         }
14947 
14948         arg->sign = 1;
14949         if (formatfloat(v, arg, p_str, NULL) == -1)
14950             return -1;
14951         break;
14952 
14953     case 'c':
14954     {
14955         Py_UCS4 ch = formatchar(v);
14956         if (ch == (Py_UCS4) -1)
14957             return -1;
14958         if (arg->width == -1 && arg->prec == -1) {
14959             /* Fast path */
14960             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14961                 return -1;
14962             return 1;
14963         }
14964         *p_str = PyUnicode_FromOrdinal(ch);
14965         break;
14966     }
14967 
14968     default:
14969         PyErr_Format(PyExc_ValueError,
14970                      "unsupported format character '%c' (0x%x) "
14971                      "at index %zd",
14972                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14973                      (int)arg->ch,
14974                      ctx->fmtpos - 1);
14975         return -1;
14976     }
14977     if (*p_str == NULL)
14978         return -1;
14979     assert (PyUnicode_Check(*p_str));
14980     return 0;
14981 }
14982 
/* Write the already formatted string 'str' into ctx->writer, applying the
   width, precision, sign and alternate-form settings stored in 'arg'.

   - For "s", "r" and "a" the string is truncated to arg->prec characters.
   - When arg->sign is set, a leading '-' or '+' of 'str' is treated as the
     sign; otherwise F_SIGN / F_BLANK supply '+' / ' '.
   - The fill character is '0' only when arg->sign is set together with
     F_ZERO; otherwise it is a space.

   Note that arg->width and arg->sign are modified while the output is
   produced.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign flag,
           the string is written unchanged */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign is written separately: skip it in the source string
               and do not count it in len */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        /* The fill character only matters if left padding is written */
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        /* Only scan the characters up to the end of the part being copied */
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra character for the sign when the digits alone
       already fill the whole width */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding;
           with space fill it is written after the padding (see below) */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign takes one column of the requested width */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately
           from the digits */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding is done: no right padding will be written below */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed
       (right padding always uses spaces) */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15137 
15138 /* Helper of PyUnicode_Format(): format one arg.
15139    Return 0 on success, raise an exception and return -1 on error. */
15140 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15141 unicode_format_arg(struct unicode_formatter_t *ctx)
15142 {
15143     struct unicode_format_arg_t arg;
15144     PyObject *str;
15145     int ret;
15146 
15147     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15148     if (arg.ch == '%') {
15149         ctx->fmtpos++;
15150         ctx->fmtcnt--;
15151         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15152             return -1;
15153         return 0;
15154     }
15155     arg.flags = 0;
15156     arg.width = -1;
15157     arg.prec = -1;
15158     arg.sign = 0;
15159     str = NULL;
15160 
15161     ret = unicode_format_arg_parse(ctx, &arg);
15162     if (ret == -1)
15163         return -1;
15164 
15165     ret = unicode_format_arg_format(ctx, &arg, &str);
15166     if (ret == -1)
15167         return -1;
15168 
15169     if (ret != 1) {
15170         ret = unicode_format_arg_output(ctx, &arg, str);
15171         Py_DECREF(str);
15172         if (ret == -1)
15173             return -1;
15174     }
15175 
15176     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15177         PyErr_SetString(PyExc_TypeError,
15178                         "not all arguments converted during string formatting");
15179         return -1;
15180     }
15181     return 0;
15182 }
15183 
15184 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)15185 PyUnicode_Format(PyObject *format, PyObject *args)
15186 {
15187     struct unicode_formatter_t ctx;
15188 
15189     if (format == NULL || args == NULL) {
15190         PyErr_BadInternalCall();
15191         return NULL;
15192     }
15193 
15194     if (ensure_unicode(format) < 0)
15195         return NULL;
15196 
15197     ctx.fmtstr = format;
15198     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15199     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15200     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15201     ctx.fmtpos = 0;
15202 
15203     _PyUnicodeWriter_Init(&ctx.writer);
15204     ctx.writer.min_length = ctx.fmtcnt + 100;
15205     ctx.writer.overallocate = 1;
15206 
15207     if (PyTuple_Check(args)) {
15208         ctx.arglen = PyTuple_Size(args);
15209         ctx.argidx = 0;
15210     }
15211     else {
15212         ctx.arglen = -1;
15213         ctx.argidx = -2;
15214     }
15215     ctx.args_owned = 0;
15216     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
15217         ctx.dict = args;
15218     else
15219         ctx.dict = NULL;
15220     ctx.args = args;
15221 
15222     while (--ctx.fmtcnt >= 0) {
15223         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15224             Py_ssize_t nonfmtpos;
15225 
15226             nonfmtpos = ctx.fmtpos++;
15227             while (ctx.fmtcnt >= 0 &&
15228                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15229                 ctx.fmtpos++;
15230                 ctx.fmtcnt--;
15231             }
15232             if (ctx.fmtcnt < 0) {
15233                 ctx.fmtpos--;
15234                 ctx.writer.overallocate = 0;
15235             }
15236 
15237             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15238                                                 nonfmtpos, ctx.fmtpos) < 0)
15239                 goto onError;
15240         }
15241         else {
15242             ctx.fmtpos++;
15243             if (unicode_format_arg(&ctx) == -1)
15244                 goto onError;
15245         }
15246     }
15247 
15248     if (ctx.argidx < ctx.arglen && !ctx.dict) {
15249         PyErr_SetString(PyExc_TypeError,
15250                         "not all arguments converted during string formatting");
15251         goto onError;
15252     }
15253 
15254     if (ctx.args_owned) {
15255         Py_DECREF(ctx.args);
15256     }
15257     return _PyUnicodeWriter_Finish(&ctx.writer);
15258 
15259   onError:
15260     _PyUnicodeWriter_Dealloc(&ctx.writer);
15261     if (ctx.args_owned) {
15262         Py_DECREF(ctx.args);
15263     }
15264     return NULL;
15265 }
15266 
15267 static PyObject *
15268 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15269 
15270 /*[clinic input]
15271 @classmethod
15272 str.__new__ as unicode_new
15273 
15274     object as x: object = NULL
15275     encoding: str = NULL
15276     errors: str = NULL
15277 
15278 [clinic start generated code]*/
15279 
15280 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15281 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15282                  const char *errors)
15283 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15284 {
15285     PyObject *unicode;
15286     if (x == NULL) {
15287         unicode = unicode_new_empty();
15288     }
15289     else if (encoding == NULL && errors == NULL) {
15290         unicode = PyObject_Str(x);
15291     }
15292     else {
15293         unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15294     }
15295 
15296     if (unicode != NULL && type != &PyUnicode_Type) {
15297         Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15298     }
15299     return unicode;
15300 }
15301 
15302 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15303 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15304 {
15305     PyObject *self;
15306     Py_ssize_t length, char_size;
15307     int share_wstr, share_utf8;
15308     unsigned int kind;
15309     void *data;
15310 
15311     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15312     assert(_PyUnicode_CHECK(unicode));
15313     if (PyUnicode_READY(unicode) == -1) {
15314         return NULL;
15315     }
15316 
15317     self = type->tp_alloc(type, 0);
15318     if (self == NULL) {
15319         return NULL;
15320     }
15321     kind = PyUnicode_KIND(unicode);
15322     length = PyUnicode_GET_LENGTH(unicode);
15323 
15324     _PyUnicode_LENGTH(self) = length;
15325 #ifdef Py_DEBUG
15326     _PyUnicode_HASH(self) = -1;
15327 #else
15328     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15329 #endif
15330     _PyUnicode_STATE(self).interned = 0;
15331     _PyUnicode_STATE(self).kind = kind;
15332     _PyUnicode_STATE(self).compact = 0;
15333     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15334     _PyUnicode_STATE(self).ready = 1;
15335     _PyUnicode_WSTR(self) = NULL;
15336     _PyUnicode_UTF8_LENGTH(self) = 0;
15337     _PyUnicode_UTF8(self) = NULL;
15338     _PyUnicode_WSTR_LENGTH(self) = 0;
15339     _PyUnicode_DATA_ANY(self) = NULL;
15340 
15341     share_utf8 = 0;
15342     share_wstr = 0;
15343     if (kind == PyUnicode_1BYTE_KIND) {
15344         char_size = 1;
15345         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15346             share_utf8 = 1;
15347     }
15348     else if (kind == PyUnicode_2BYTE_KIND) {
15349         char_size = 2;
15350         if (sizeof(wchar_t) == 2)
15351             share_wstr = 1;
15352     }
15353     else {
15354         assert(kind == PyUnicode_4BYTE_KIND);
15355         char_size = 4;
15356         if (sizeof(wchar_t) == 4)
15357             share_wstr = 1;
15358     }
15359 
15360     /* Ensure we won't overflow the length. */
15361     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15362         PyErr_NoMemory();
15363         goto onError;
15364     }
15365     data = PyObject_Malloc((length + 1) * char_size);
15366     if (data == NULL) {
15367         PyErr_NoMemory();
15368         goto onError;
15369     }
15370 
15371     _PyUnicode_DATA_ANY(self) = data;
15372     if (share_utf8) {
15373         _PyUnicode_UTF8_LENGTH(self) = length;
15374         _PyUnicode_UTF8(self) = data;
15375     }
15376     if (share_wstr) {
15377         _PyUnicode_WSTR_LENGTH(self) = length;
15378         _PyUnicode_WSTR(self) = (wchar_t *)data;
15379     }
15380 
15381     memcpy(data, PyUnicode_DATA(unicode),
15382               kind * (length + 1));
15383     assert(_PyUnicode_CheckConsistency(self, 1));
15384 #ifdef Py_DEBUG
15385     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15386 #endif
15387     return self;
15388 
15389 onError:
15390     Py_DECREF(self);
15391     return NULL;
15392 }
15393 
/* Deallocate 'op', which must be an exact str object (not an instance of a
   subclass); simply forwards to unicode_dealloc(). */
void
_PyUnicode_ExactDealloc(PyObject *op)
{
    assert(PyUnicode_CheckExact(op));
    unicode_dealloc(op);
}
15400 
/* Docstring of the str type (used as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15412 
15413 static PyObject *unicode_iter(PyObject *seq);
15414 
15415 PyTypeObject PyUnicode_Type = {
15416     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15417     "str",                        /* tp_name */
15418     sizeof(PyUnicodeObject),      /* tp_basicsize */
15419     0,                            /* tp_itemsize */
15420     /* Slots */
15421     (destructor)unicode_dealloc,  /* tp_dealloc */
15422     0,                            /* tp_vectorcall_offset */
15423     0,                            /* tp_getattr */
15424     0,                            /* tp_setattr */
15425     0,                            /* tp_as_async */
15426     unicode_repr,                 /* tp_repr */
15427     &unicode_as_number,           /* tp_as_number */
15428     &unicode_as_sequence,         /* tp_as_sequence */
15429     &unicode_as_mapping,          /* tp_as_mapping */
15430     (hashfunc) unicode_hash,      /* tp_hash*/
15431     0,                            /* tp_call*/
15432     (reprfunc) unicode_str,       /* tp_str */
15433     PyObject_GenericGetAttr,      /* tp_getattro */
15434     0,                            /* tp_setattro */
15435     0,                            /* tp_as_buffer */
15436     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15437         Py_TPFLAGS_UNICODE_SUBCLASS |
15438         _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15439     unicode_doc,                  /* tp_doc */
15440     0,                            /* tp_traverse */
15441     0,                            /* tp_clear */
15442     PyUnicode_RichCompare,        /* tp_richcompare */
15443     0,                            /* tp_weaklistoffset */
15444     unicode_iter,                 /* tp_iter */
15445     0,                            /* tp_iternext */
15446     unicode_methods,              /* tp_methods */
15447     0,                            /* tp_members */
15448     0,                            /* tp_getset */
15449     0,                            /* tp_base */
15450     0,                            /* tp_dict */
15451     0,                            /* tp_descr_get */
15452     0,                            /* tp_descr_set */
15453     0,                            /* tp_dictoffset */
15454     0,                            /* tp_init */
15455     0,                            /* tp_alloc */
15456     unicode_new,                  /* tp_new */
15457     PyObject_Del,                 /* tp_free */
15458 };
15459 
15460 /* Initialize the Unicode implementation */
15461 
15462 void
_PyUnicode_InitState(PyInterpreterState * interp)15463 _PyUnicode_InitState(PyInterpreterState *interp)
15464 {
15465     if (!_Py_IsMainInterpreter(interp)) {
15466         return;
15467     }
15468 
15469     /* initialize the linebreak bloom filter */
15470     const Py_UCS2 linebreak[] = {
15471         0x000A, /* LINE FEED */
15472         0x000D, /* CARRIAGE RETURN */
15473         0x001C, /* FILE SEPARATOR */
15474         0x001D, /* GROUP SEPARATOR */
15475         0x001E, /* RECORD SEPARATOR */
15476         0x0085, /* NEXT LINE */
15477         0x2028, /* LINE SEPARATOR */
15478         0x2029, /* PARAGRAPH SEPARATOR */
15479     };
15480     bloom_linebreak = make_bloom_mask(
15481         PyUnicode_2BYTE_KIND, linebreak,
15482         Py_ARRAY_LENGTH(linebreak));
15483 }
15484 
15485 
15486 PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15487 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15488 {
15489     if (!_Py_IsMainInterpreter(interp)) {
15490         return _PyStatus_OK();
15491     }
15492 
15493 #ifdef Py_DEBUG
15494     assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
15495 
15496     for (int i = 0; i < 256; i++) {
15497         assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
15498     }
15499 #endif
15500 
15501     return _PyStatus_OK();
15502 }
15503 
15504 
15505 PyStatus
_PyUnicode_InitTypes(PyInterpreterState * interp)15506 _PyUnicode_InitTypes(PyInterpreterState *interp)
15507 {
15508     if (!_Py_IsMainInterpreter(interp)) {
15509         return _PyStatus_OK();
15510     }
15511 
15512     if (PyType_Ready(&EncodingMapType) < 0) {
15513         goto error;
15514     }
15515     if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15516         goto error;
15517     }
15518     if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15519         goto error;
15520     }
15521     return _PyStatus_OK();
15522 
15523 error:
15524     return _PyStatus_ERR("Can't initialize unicode types");
15525 }
15526 
15527 
15528 void
PyUnicode_InternInPlace(PyObject ** p)15529 PyUnicode_InternInPlace(PyObject **p)
15530 {
15531     PyObject *s = *p;
15532 #ifdef Py_DEBUG
15533     assert(s != NULL);
15534     assert(_PyUnicode_CHECK(s));
15535 #else
15536     if (s == NULL || !PyUnicode_Check(s)) {
15537         return;
15538     }
15539 #endif
15540 
15541     /* If it's a subclass, we don't really know what putting
15542        it in the interned dict might do. */
15543     if (!PyUnicode_CheckExact(s)) {
15544         return;
15545     }
15546 
15547     if (PyUnicode_CHECK_INTERNED(s)) {
15548         return;
15549     }
15550 
15551     if (PyUnicode_READY(s) == -1) {
15552         PyErr_Clear();
15553         return;
15554     }
15555 
15556     if (interned == NULL) {
15557         interned = PyDict_New();
15558         if (interned == NULL) {
15559             PyErr_Clear(); /* Don't leave an exception */
15560             return;
15561         }
15562     }
15563 
15564     PyObject *t = PyDict_SetDefault(interned, s, s);
15565     if (t == NULL) {
15566         PyErr_Clear();
15567         return;
15568     }
15569 
15570     if (t != s) {
15571         Py_INCREF(t);
15572         Py_SETREF(*p, t);
15573         return;
15574     }
15575 
15576     /* The two references in interned dict (key and value) are not counted by
15577        refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15578        this. */
15579     Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
15580     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15581 }
15582 
15583 void
PyUnicode_InternImmortal(PyObject ** p)15584 PyUnicode_InternImmortal(PyObject **p)
15585 {
15586     if (PyErr_WarnEx(PyExc_DeprecationWarning,
15587             "PyUnicode_InternImmortal() is deprecated; "
15588             "use PyUnicode_InternInPlace() instead", 1) < 0)
15589     {
15590         // The function has no return value, the exception cannot
15591         // be reported to the caller, so just log it.
15592         PyErr_WriteUnraisable(NULL);
15593     }
15594 
15595     PyUnicode_InternInPlace(p);
15596     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15597         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15598         Py_INCREF(*p);
15599     }
15600 }
15601 
15602 PyObject *
PyUnicode_InternFromString(const char * cp)15603 PyUnicode_InternFromString(const char *cp)
15604 {
15605     PyObject *s = PyUnicode_FromString(cp);
15606     if (s == NULL)
15607         return NULL;
15608     PyUnicode_InternInPlace(&s);
15609     return s;
15610 }
15611 
15612 
15613 void
_PyUnicode_ClearInterned(PyInterpreterState * interp)15614 _PyUnicode_ClearInterned(PyInterpreterState *interp)
15615 {
15616     if (!_Py_IsMainInterpreter(interp)) {
15617         // interned dict is shared by all interpreters
15618         return;
15619     }
15620 
15621     if (interned == NULL) {
15622         return;
15623     }
15624     assert(PyDict_CheckExact(interned));
15625 
15626     /* Interned unicode strings are not forcibly deallocated; rather, we give
15627        them their stolen references back, and then clear and DECREF the
15628        interned dict. */
15629 
15630 #ifdef INTERNED_STATS
15631     fprintf(stderr, "releasing %zd interned strings\n",
15632             PyDict_GET_SIZE(interned));
15633 
15634     Py_ssize_t immortal_size = 0, mortal_size = 0;
15635 #endif
15636     Py_ssize_t pos = 0;
15637     PyObject *s, *ignored_value;
15638     while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15639         assert(PyUnicode_IS_READY(s));
15640 
15641         switch (PyUnicode_CHECK_INTERNED(s)) {
15642         case SSTATE_INTERNED_IMMORTAL:
15643             Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
15644 #ifdef INTERNED_STATS
15645             immortal_size += PyUnicode_GET_LENGTH(s);
15646 #endif
15647             break;
15648         case SSTATE_INTERNED_MORTAL:
15649             // Restore the two references (key and value) ignored
15650             // by PyUnicode_InternInPlace().
15651             Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
15652 #ifdef INTERNED_STATS
15653             mortal_size += PyUnicode_GET_LENGTH(s);
15654 #endif
15655             break;
15656         case SSTATE_NOT_INTERNED:
15657             /* fall through */
15658         default:
15659             Py_UNREACHABLE();
15660         }
15661         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15662     }
15663 #ifdef INTERNED_STATS
15664     fprintf(stderr,
15665             "total size of all interned strings: %zd/%zd mortal/immortal\n",
15666             mortal_size, immortal_size);
15667 #endif
15668 
15669     PyDict_Clear(interned);
15670     Py_CLEAR(interned);
15671 }
15672 
15673 
15674 /********************* Unicode Iterator **************************/
15675 
/* Iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15681 
/* Deallocate a str iterator: untrack it from the garbage collector,
   release the reference to the iterated string (if any) and free the
   iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator is exhausted */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15689 
/* Garbage collector traversal: the iterated string is the only object
   referenced by the iterator. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15696 
15697 static PyObject *
unicodeiter_next(unicodeiterobject * it)15698 unicodeiter_next(unicodeiterobject *it)
15699 {
15700     PyObject *seq;
15701 
15702     assert(it != NULL);
15703     seq = it->it_seq;
15704     if (seq == NULL)
15705         return NULL;
15706     assert(_PyUnicode_CHECK(seq));
15707 
15708     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15709         int kind = PyUnicode_KIND(seq);
15710         const void *data = PyUnicode_DATA(seq);
15711         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15712         it->it_index++;
15713         return unicode_char(chr);
15714     }
15715 
15716     it->it_seq = NULL;
15717     Py_DECREF(seq);
15718     return NULL;
15719 }
15720 
15721 static PyObject *
unicode_ascii_iter_next(unicodeiterobject * it)15722 unicode_ascii_iter_next(unicodeiterobject *it)
15723 {
15724     assert(it != NULL);
15725     PyObject *seq = it->it_seq;
15726     if (seq == NULL) {
15727         return NULL;
15728     }
15729     assert(_PyUnicode_CHECK(seq));
15730     assert(PyUnicode_IS_COMPACT_ASCII(seq));
15731     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15732         const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15733         Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15734                                               data, it->it_index);
15735         it->it_index++;
15736         PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15737         return Py_NewRef(item);
15738     }
15739     it->it_seq = NULL;
15740     Py_DECREF(seq);
15741     return NULL;
15742 }
15743 
15744 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15745 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15746 {
15747     Py_ssize_t len = 0;
15748     if (it->it_seq)
15749         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15750     return PyLong_FromSsize_t(len);
15751 }
15752 
15753 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15754 
15755 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15756 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15757 {
15758     PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15759 
15760     /* _PyEval_GetBuiltin can invoke arbitrary code,
15761      * call must be before access of iterator pointers.
15762      * see issue #101765 */
15763 
15764     if (it->it_seq != NULL) {
15765         return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15766     } else {
15767         PyObject *u = (PyObject *)_PyUnicode_New(0);
15768         if (u == NULL) {
15769             Py_XDECREF(iter);
15770             return NULL;
15771         }
15772         return Py_BuildValue("N(N)", iter, u);
15773     }
15774 }
15775 
15776 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15777 
15778 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15779 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15780 {
15781     Py_ssize_t index = PyLong_AsSsize_t(state);
15782     if (index == -1 && PyErr_Occurred())
15783         return NULL;
15784     if (it->it_seq != NULL) {
15785         if (index < 0)
15786             index = 0;
15787         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15788             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15789         it->it_index = index;
15790     }
15791     Py_RETURN_NONE;
15792 }
15793 
15794 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15795 
15796 static PyMethodDef unicodeiter_methods[] = {
15797     {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15798      length_hint_doc},
15799     {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15800      reduce_doc},
15801     {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15802      setstate_doc},
15803     {NULL,      NULL}       /* sentinel */
15804 };
15805 
15806 PyTypeObject PyUnicodeIter_Type = {
15807     PyVarObject_HEAD_INIT(&PyType_Type, 0)
15808     "str_iterator",         /* tp_name */
15809     sizeof(unicodeiterobject),      /* tp_basicsize */
15810     0,                  /* tp_itemsize */
15811     /* methods */
15812     (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15813     0,                  /* tp_vectorcall_offset */
15814     0,                  /* tp_getattr */
15815     0,                  /* tp_setattr */
15816     0,                  /* tp_as_async */
15817     0,                  /* tp_repr */
15818     0,                  /* tp_as_number */
15819     0,                  /* tp_as_sequence */
15820     0,                  /* tp_as_mapping */
15821     0,                  /* tp_hash */
15822     0,                  /* tp_call */
15823     0,                  /* tp_str */
15824     PyObject_GenericGetAttr,        /* tp_getattro */
15825     0,                  /* tp_setattro */
15826     0,                  /* tp_as_buffer */
15827     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15828     0,                  /* tp_doc */
15829     (traverseproc)unicodeiter_traverse, /* tp_traverse */
15830     0,                  /* tp_clear */
15831     0,                  /* tp_richcompare */
15832     0,                  /* tp_weaklistoffset */
15833     PyObject_SelfIter,          /* tp_iter */
15834     (iternextfunc)unicodeiter_next,     /* tp_iternext */
15835     unicodeiter_methods,            /* tp_methods */
15836     0,
15837 };
15838 
/* Type object "str_ascii_iterator": unicode_iter() uses it for strings for
 * which PyUnicode_IS_COMPACT_ASCII() is true.  It shares its object layout,
 * dealloc, traverse and method table with PyUnicodeIter_Type and differs
 * only in the tp_iternext implementation. */
PyTypeObject _PyUnicodeASCIIIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "str_ascii_iterator",
    .tp_basicsize = sizeof(unicodeiterobject),
    .tp_dealloc = (destructor)unicodeiter_dealloc,
    .tp_getattro = PyObject_GenericGetAttr,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
    .tp_traverse = (traverseproc)unicodeiter_traverse,
    .tp_iter = PyObject_SelfIter,
    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
    .tp_methods = unicodeiter_methods,
};
15851 
15852 static PyObject *
unicode_iter(PyObject * seq)15853 unicode_iter(PyObject *seq)
15854 {
15855     unicodeiterobject *it;
15856 
15857     if (!PyUnicode_Check(seq)) {
15858         PyErr_BadInternalCall();
15859         return NULL;
15860     }
15861     if (PyUnicode_READY(seq) == -1)
15862         return NULL;
15863     if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15864         it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15865     }
15866     else {
15867         it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15868     }
15869     if (it == NULL)
15870         return NULL;
15871     it->it_index = 0;
15872     Py_INCREF(seq);
15873     it->it_seq = seq;
15874     _PyObject_GC_TRACK(it);
15875     return (PyObject *)it;
15876 }
15877 
15878 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15879 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15880 {
15881     int res;
15882     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15883     if (res == -2) {
15884         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15885         return -1;
15886     }
15887     if (res < 0) {
15888         PyErr_NoMemory();
15889         return -1;
15890     }
15891     return 0;
15892 }
15893 
15894 
15895 static int
config_get_codec_name(wchar_t ** config_encoding)15896 config_get_codec_name(wchar_t **config_encoding)
15897 {
15898     char *encoding;
15899     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15900         return -1;
15901     }
15902 
15903     PyObject *name_obj = NULL;
15904     PyObject *codec = _PyCodec_Lookup(encoding);
15905     PyMem_RawFree(encoding);
15906 
15907     if (!codec)
15908         goto error;
15909 
15910     name_obj = PyObject_GetAttrString(codec, "name");
15911     Py_CLEAR(codec);
15912     if (!name_obj) {
15913         goto error;
15914     }
15915 
15916     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15917     Py_DECREF(name_obj);
15918     if (wname == NULL) {
15919         goto error;
15920     }
15921 
15922     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15923     if (raw_wname == NULL) {
15924         PyMem_Free(wname);
15925         PyErr_NoMemory();
15926         goto error;
15927     }
15928 
15929     PyMem_RawFree(*config_encoding);
15930     *config_encoding = raw_wname;
15931 
15932     PyMem_Free(wname);
15933     return 0;
15934 
15935 error:
15936     Py_XDECREF(codec);
15937     Py_XDECREF(name_obj);
15938     return -1;
15939 }
15940 
15941 
15942 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)15943 init_stdio_encoding(PyInterpreterState *interp)
15944 {
15945     /* Update the stdio encoding to the normalized Python codec name. */
15946     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15947     if (config_get_codec_name(&config->stdio_encoding) < 0) {
15948         return _PyStatus_ERR("failed to get the Python codec name "
15949                              "of the stdio encoding");
15950     }
15951     return _PyStatus_OK();
15952 }
15953 
15954 
15955 static int
init_fs_codec(PyInterpreterState * interp)15956 init_fs_codec(PyInterpreterState *interp)
15957 {
15958     const PyConfig *config = _PyInterpreterState_GetConfig(interp);
15959 
15960     _Py_error_handler error_handler;
15961     error_handler = get_error_handler_wide(config->filesystem_errors);
15962     if (error_handler == _Py_ERROR_UNKNOWN) {
15963         PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
15964         return -1;
15965     }
15966 
15967     char *encoding, *errors;
15968     if (encode_wstr_utf8(config->filesystem_encoding,
15969                          &encoding,
15970                          "filesystem_encoding") < 0) {
15971         return -1;
15972     }
15973 
15974     if (encode_wstr_utf8(config->filesystem_errors,
15975                          &errors,
15976                          "filesystem_errors") < 0) {
15977         PyMem_RawFree(encoding);
15978         return -1;
15979     }
15980 
15981     struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
15982     PyMem_RawFree(fs_codec->encoding);
15983     fs_codec->encoding = encoding;
15984     /* encoding has been normalized by init_fs_encoding() */
15985     fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
15986     PyMem_RawFree(fs_codec->errors);
15987     fs_codec->errors = errors;
15988     fs_codec->error_handler = error_handler;
15989 
15990 #ifdef _Py_FORCE_UTF8_FS_ENCODING
15991     assert(fs_codec->utf8 == 1);
15992 #endif
15993 
15994     /* At this point, PyUnicode_EncodeFSDefault() and
15995        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15996        the C implementation of the filesystem encoding. */
15997 
15998     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15999        global configuration variables. */
16000     if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16001                                   fs_codec->errors) < 0) {
16002         PyErr_NoMemory();
16003         return -1;
16004     }
16005     return 0;
16006 }
16007 
16008 
16009 static PyStatus
init_fs_encoding(PyThreadState * tstate)16010 init_fs_encoding(PyThreadState *tstate)
16011 {
16012     PyInterpreterState *interp = tstate->interp;
16013 
16014     /* Update the filesystem encoding to the normalized Python codec name.
16015        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16016        (Python codec name). */
16017     PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16018     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16019         _Py_DumpPathConfig(tstate);
16020         return _PyStatus_ERR("failed to get the Python codec "
16021                              "of the filesystem encoding");
16022     }
16023 
16024     if (init_fs_codec(interp) < 0) {
16025         return _PyStatus_ERR("cannot initialize filesystem codec");
16026     }
16027     return _PyStatus_OK();
16028 }
16029 
16030 
16031 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16032 _PyUnicode_InitEncodings(PyThreadState *tstate)
16033 {
16034     PyStatus status = init_fs_encoding(tstate);
16035     if (_PyStatus_EXCEPTION(status)) {
16036         return status;
16037     }
16038 
16039     return init_stdio_encoding(tstate->interp);
16040 }
16041 
16042 
16043 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16044 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16045 {
16046     PyMem_RawFree(fs_codec->encoding);
16047     fs_codec->encoding = NULL;
16048     fs_codec->utf8 = 0;
16049     PyMem_RawFree(fs_codec->errors);
16050     fs_codec->errors = NULL;
16051     fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16052 }
16053 
16054 
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
 * "replace" error handler and re-initialize the filesystem codec.
 * Returns the result of init_fs_codec(), or -1 with MemoryError set if the
 * new strings cannot be allocated (the configuration is then unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        /* Both copies exist: swap them into the configuration. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed; release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16080 
16081 
#ifdef Py_DEBUG
/* Debug-build helper: returns non-zero when the file-level `interned`
 * variable is NULL.  _PyUnicode_Fini() asserts that `interned` is NULL by
 * the time it runs for the main interpreter. */
static inline int
unicode_is_finalizing(void)
{
    return (interned == NULL);
}
#endif
16089 
16090 
16091 void
_PyUnicode_FiniTypes(PyInterpreterState * interp)16092 _PyUnicode_FiniTypes(PyInterpreterState *interp)
16093 {
16094     if (!_Py_IsMainInterpreter(interp)) {
16095         return;
16096     }
16097 
16098     _PyStaticType_Dealloc(&EncodingMapType);
16099     _PyStaticType_Dealloc(&PyFieldNameIter_Type);
16100     _PyStaticType_Dealloc(&PyFormatterIter_Type);
16101 }
16102 
16103 
/* Free the heap buffers cached on a statically allocated compact string
 * (the wstr and utf8 representations) and reset the matching fields.  The
 * string object itself is not freed. */
static void
unicode_static_dealloc(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);

    assert(ascii->state.compact);

    if (ascii->state.ascii) {
        /* For ASCII strings only the wstr cache is released here. */
        if (ascii->wstr) {
            PyObject_Free(ascii->wstr);
            ascii->wstr = NULL;
        }
        return;
    }

    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    /* Address right after the compact header; a wstr pointing there is not
       a separate allocation (presumably it aliases the inline character
       data), so it must not be freed. */
    void *inline_data = (void *)(compact + 1);

    if (ascii->wstr && ascii->wstr != inline_data) {
        PyObject_Free(ascii->wstr);
        ascii->wstr = NULL;
        compact->wstr_length = 0;
    }
    if (compact->utf8) {
        PyObject_Free(compact->utf8);
        compact->utf8 = NULL;
        compact->utf8_length = 0;
    }
}
16131 
16132 
16133 void
_PyUnicode_Fini(PyInterpreterState * interp)16134 _PyUnicode_Fini(PyInterpreterState *interp)
16135 {
16136     struct _Py_unicode_state *state = &interp->unicode;
16137 
16138     if (_Py_IsMainInterpreter(interp)) {
16139         // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16140         assert(interned == NULL);
16141         // bpo-47182: force a unicodedata CAPI capsule re-import on
16142         // subsequent initialization of main interpreter.
16143         ucnhash_capi = NULL;
16144     }
16145 
16146     _PyUnicode_FiniEncodings(&state->fs_codec);
16147 
16148     unicode_clear_identifiers(state);
16149 
16150     // Clear the single character singletons
16151     for (int i = 0; i < 128; i++) {
16152         unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
16153     }
16154     for (int i = 0; i < 128; i++) {
16155         unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
16156     }
16157 }
16158 
16159 
/* Non-static entry point that forwards to unicode_static_dealloc(), which
 * frees the buffers cached on a statically allocated string object. */
void
_PyStaticUnicode_Dealloc(PyObject *op)
{
    unicode_static_dealloc(op);
}
16165 
16166 
16167 /* A _string module, to export formatter_parser and formatter_field_name_split
16168    to the string.Formatter class implemented in Python. */
16169 
16170 static PyMethodDef _string_methods[] = {
16171     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16172      METH_O, PyDoc_STR("split the argument as a field name")},
16173     {"formatter_parser", (PyCFunction) formatter_parser,
16174      METH_O, PyDoc_STR("parse the argument as a format string")},
16175     {NULL, NULL}
16176 };
16177 
/* Module definition for "_string".  m_size is 0 and the only content is the
 * _string_methods table. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_string",
    .m_doc = PyDoc_STR("string helper module"),
    .m_size = 0,
    .m_methods = _string_methods,
};
16185 
/* Initialization function of the _string module: returns the result of
 * PyModuleDef_Init() applied to the module definition above. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModuleDef_Init(&_string_module);
}
16191 
16192 
16193 #ifdef __cplusplus
16194 }
16195 #endif
16196