1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <[email protected]>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_abstract.h" // _PyIndex_Check()
44 #include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
45 #include "pycore_bytesobject.h" // _PyBytes_Repeat()
46 #include "pycore_bytes_methods.h" // _Py_bytes_lower()
47 #include "pycore_format.h" // F_LJUST
48 #include "pycore_initconfig.h" // _PyStatus_OK()
49 #include "pycore_interp.h" // PyInterpreterState.fs_codec
50 #include "pycore_long.h" // _PyLong_FormatWriter()
51 #include "pycore_object.h" // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
52 #include "pycore_pathconfig.h" // _Py_DumpPathConfig()
53 #include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
54 #include "pycore_pystate.h" // _PyInterpreterState_GET()
55 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
56 #include "pycore_unicodeobject.h" // struct _Py_unicode_state
57 #include "stringlib/eq.h" // unicode_eq()
58
59 #ifdef MS_WINDOWS
60 #include <windows.h>
61 #endif
62
63 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
64 # include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
65 #endif
66
67 /* Uncomment to display statistics on interned strings at exit
68 in _PyUnicode_ClearInterned(). */
69 /* #define INTERNED_STATS 1 */
70
71
72 /*[clinic input]
73 class str "PyObject *" "&PyUnicode_Type"
74 [clinic start generated code]*/
75 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
76
77 /*[python input]
78 class Py_UCS4_converter(CConverter):
79 type = 'Py_UCS4'
80 converter = 'convert_uc'
81
82 def converter_init(self):
83 if self.default is not unspecified:
84 self.c_default = ascii(self.default)
85 if len(self.c_default) > 4 or self.c_default[0] != "'":
86 self.c_default = hex(ord(self.default))
87
88 [python start generated code]*/
89 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
90
91 /* --- Globals ------------------------------------------------------------
92
93 NOTE: In the interpreter's initialization phase, some globals are currently
94 initialized dynamically as needed. In the process Unicode objects may
95 be created before the Unicode type is ready.
96
97 */
98
99
100 #ifdef __cplusplus
101 extern "C" {
102 #endif
103
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

// Sanity check meant to be wrapped in assert(): in debug builds (Py_DEBUG)
// it runs _PyUnicode_CheckConsistency() without the content scan
// (check_content=0); in other builds it is only a type check.
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
113
/* Field accessors for the string structs.

   The "raw" macros expand to a plain struct member, so they can also be used
   as assignment targets.  The checked macros assert their preconditions
   first and then evaluate to the value. */

/* Raw utf8 pointer field of the compact struct. */
#define _PyUnicode_UTF8(op)                             \
    (_PyCompactUnicodeObject_CAST(op)->utf8)
/* UTF-8 pointer of a ready string: for a compact ASCII string this is the
   memory right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)(_PyASCIIObject_CAST(op) + 1)) :       \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field of the compact struct. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->utf8_length)
/* UTF-8 length of a ready string: the character length for a compact ASCII
   string, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         _PyASCIIObject_CAST(op)->length :              \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer field. */
#define _PyUnicode_WSTR(op)                             \
    (_PyASCIIObject_CAST(op)->wstr)

/* Don't use deprecated macro of unicodeobject.h */
#undef PyUnicode_WSTR_LENGTH
/* Length of the wstr representation: the character length for a compact
   ASCII string, otherwise the wstr_length field. */
#define PyUnicode_WSTR_LENGTH(op) \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         _PyASCIIObject_CAST(op)->length :              \
         _PyCompactUnicodeObject_CAST(op)->wstr_length)
/* Raw wstr_length field of the compact struct. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (_PyCompactUnicodeObject_CAST(op)->wstr_length)
/* Raw length field. */
#define _PyUnicode_LENGTH(op)                           \
    (_PyASCIIObject_CAST(op)->length)
/* Raw state bit-field struct. */
#define _PyUnicode_STATE(op)                            \
    (_PyASCIIObject_CAST(op)->state)
/* Raw hash field. */
#define _PyUnicode_HASH(op)                             \
    (_PyASCIIObject_CAST(op)->hash)
/* state.kind, after asserting _PyUnicode_CHECK(). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->state.kind)
/* length, after asserting _PyUnicode_CHECK(). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyASCIIObject_CAST(op)->length)
/* Raw data.any pointer of the full PyUnicodeObject struct. */
#define _PyUnicode_DATA_ANY(op)                         \
    (_PyUnicodeObject_CAST(op)->data.any)
155
/* Local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(), then
   evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer is the same as the character data pointer.
   Must not be used on a compact ASCII string (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same as its character data pointer,
   i.e. the wchar_t representation shares the data buffer.
   The macro uses only its own argument, so it works whatever the caller's
   variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
170
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is set, and it differs from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready or wstr differs from the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
184
/* Generic helper macro that copies a run of characters while converting each
   one from from_type to to_type.  from_type and to_type must be valid type
   names; begin and end delimit the source characters (type "from_type *");
   to points at the output buffer (type "to_type *").

   The bulk of the input is converted four characters per iteration; the
   remaining 0-3 characters are converted one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_cb_dst = (to_type *)(to);                          \
        const from_type *_cb_src = (const from_type *)(begin);       \
        const from_type *_cb_end = (const from_type *)(end);         \
        Py_ssize_t _cb_len = (_cb_end) - (_cb_src);                  \
        const from_type *_cb_bulk_end =                              \
            _cb_src + _Py_SIZE_ROUND_DOWN(_cb_len, 4);               \
        for (; _cb_src < (_cb_bulk_end);                             \
               _cb_src += 4, _cb_dst += 4) {                         \
            _cb_dst[0] = (to_type) _cb_src[0];                       \
            _cb_dst[1] = (to_type) _cb_src[1];                       \
            _cb_dst[2] = (to_type) _cb_src[2];                       \
            _cb_dst[3] = (to_type) _cb_src[3];                       \
        }                                                            \
        for (; _cb_src < (_cb_end); ++_cb_src, ++_cb_dst) {          \
            *_cb_dst = (to_type) *_cb_src;                           \
        }                                                            \
    } while (0)
208
/* Pointer (as PyObject*) to the one-character string singleton for the
   Latin-1 code point ch: the "ascii" table for ch < 128, the "latin1" table
   (indexed from 128) otherwise.  The macro does not touch any reference
   count.

   ch is parenthesized so that any expression can be passed; it is still
   evaluated more than once, so it must not have side effects. */
#define LATIN1(ch)  \
    ((ch) < 128 \
     ? (PyObject*)&_Py_SINGLETON(strings).ascii[(ch)] \
     : (PyObject*)&_Py_SINGLETON(strings).latin1[(ch) - 128])
213
/* Platform-dependent overallocation setting.  Going by the notes below, the
   value acts as a divisor: 2 stands for 50% extra, 4 for 25% extra.
   NOTE(review): the code using this macro is outside this section --
   confirm how the value is applied. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
# define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
# define OVERALLOCATE_FACTOR 4
#endif
221
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
231
232 /* Forward declaration */
233 static inline int
234 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
235 static inline void
236 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
237 static PyObject *
238 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
239 const char *errors);
240 static PyObject *
241 unicode_decode_utf8(const char *s, Py_ssize_t size,
242 _Py_error_handler error_handler, const char *errors,
243 Py_ssize_t *consumed);
244 #ifdef Py_DEBUG
245 static inline int unicode_is_finalizing(void);
246 static int unicode_is_singleton(PyObject *unicode);
247 #endif
248
249
250 // Return a borrowed reference to the empty string singleton.
unicode_get_empty(void)251 static inline PyObject* unicode_get_empty(void)
252 {
253 _Py_DECLARE_STR(empty, "");
254 return &_Py_STR(empty);
255 }
256
257
258 // Return a strong reference to the empty string singleton.
unicode_new_empty(void)259 static inline PyObject* unicode_new_empty(void)
260 {
261 PyObject *empty = unicode_get_empty();
262 Py_INCREF(empty);
263 return empty;
264 }
265
/* Statement macro: return a new reference to the empty string singleton
   from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY()   \
    do {                             \
        return unicode_new_empty();  \
    } while (0)
270
271 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)272 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
273 Py_ssize_t start, Py_ssize_t length)
274 {
275 assert(0 <= start);
276 assert(kind != PyUnicode_WCHAR_KIND);
277 switch (kind) {
278 case PyUnicode_1BYTE_KIND: {
279 assert(value <= 0xff);
280 Py_UCS1 ch = (unsigned char)value;
281 Py_UCS1 *to = (Py_UCS1 *)data + start;
282 memset(to, ch, length);
283 break;
284 }
285 case PyUnicode_2BYTE_KIND: {
286 assert(value <= 0xffff);
287 Py_UCS2 ch = (Py_UCS2)value;
288 Py_UCS2 *to = (Py_UCS2 *)data + start;
289 const Py_UCS2 *end = to + length;
290 for (; to < end; ++to) *to = ch;
291 break;
292 }
293 case PyUnicode_4BYTE_KIND: {
294 assert(value <= MAX_UNICODE);
295 Py_UCS4 ch = value;
296 Py_UCS4 * to = (Py_UCS4 *)data + start;
297 const Py_UCS4 *end = to + length;
298 for (; to < end; ++to) *to = ch;
299 break;
300 }
301 default: Py_UNREACHABLE();
302 }
303 }
304
305
/* Fast detection of the most frequent whitespace characters: a lookup table
   over the 128 ASCII code points, 1 for whitespace and 0 for everything
   else (entries not listed are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
336
337 /* forward */
338 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
339 static PyObject* get_latin1_char(unsigned char ch);
340 static int unicode_modifiable(PyObject *unicode);
341
342
343 static PyObject *
344 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
345 static PyObject *
346 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
347 static PyObject *
348 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
349
350 static PyObject *
351 unicode_encode_call_errorhandler(const char *errors,
352 PyObject **errorHandler,const char *encoding, const char *reason,
353 PyObject *unicode, PyObject **exceptionObject,
354 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
355
356 static void
357 raise_encode_exception(PyObject **exceptionObject,
358 const char *encoding,
359 PyObject *unicode,
360 Py_ssize_t startpos, Py_ssize_t endpos,
361 const char *reason);
362
/* Same for linebreaks: a lookup table over the 128 ASCII code points, 1 for
   line break characters and 0 for everything else (entries not listed are
   zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
390
391 static int convert_uc(PyObject *obj, void *addr);
392
393 struct encoding_map;
394 #include "clinic/unicodeobject.c.h"
395
396 _Py_error_handler
_Py_GetErrorHandler(const char * errors)397 _Py_GetErrorHandler(const char *errors)
398 {
399 if (errors == NULL || strcmp(errors, "strict") == 0) {
400 return _Py_ERROR_STRICT;
401 }
402 if (strcmp(errors, "surrogateescape") == 0) {
403 return _Py_ERROR_SURROGATEESCAPE;
404 }
405 if (strcmp(errors, "replace") == 0) {
406 return _Py_ERROR_REPLACE;
407 }
408 if (strcmp(errors, "ignore") == 0) {
409 return _Py_ERROR_IGNORE;
410 }
411 if (strcmp(errors, "backslashreplace") == 0) {
412 return _Py_ERROR_BACKSLASHREPLACE;
413 }
414 if (strcmp(errors, "surrogatepass") == 0) {
415 return _Py_ERROR_SURROGATEPASS;
416 }
417 if (strcmp(errors, "xmlcharrefreplace") == 0) {
418 return _Py_ERROR_XMLCHARREFREPLACE;
419 }
420 return _Py_ERROR_OTHER;
421 }
422
423
424 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)425 get_error_handler_wide(const wchar_t *errors)
426 {
427 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
428 return _Py_ERROR_STRICT;
429 }
430 if (wcscmp(errors, L"surrogateescape") == 0) {
431 return _Py_ERROR_SURROGATEESCAPE;
432 }
433 if (wcscmp(errors, L"replace") == 0) {
434 return _Py_ERROR_REPLACE;
435 }
436 if (wcscmp(errors, L"ignore") == 0) {
437 return _Py_ERROR_IGNORE;
438 }
439 if (wcscmp(errors, L"backslashreplace") == 0) {
440 return _Py_ERROR_BACKSLASHREPLACE;
441 }
442 if (wcscmp(errors, L"surrogatepass") == 0) {
443 return _Py_ERROR_SURROGATEPASS;
444 }
445 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
446 return _Py_ERROR_XMLCHARREFREPLACE;
447 }
448 return _Py_ERROR_OTHER;
449 }
450
451
452 static inline int
unicode_check_encoding_errors(const char * encoding,const char * errors)453 unicode_check_encoding_errors(const char *encoding, const char *errors)
454 {
455 if (encoding == NULL && errors == NULL) {
456 return 0;
457 }
458
459 PyInterpreterState *interp = _PyInterpreterState_GET();
460 #ifndef Py_DEBUG
461 /* In release mode, only check in development mode (-X dev) */
462 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
463 return 0;
464 }
465 #else
466 /* Always check in debug mode */
467 #endif
468
469 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
470 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
471 if (!interp->unicode.fs_codec.encoding) {
472 return 0;
473 }
474
475 /* Disable checks during Python finalization. For example, it allows to
476 call _PyObject_Dump() during finalization for debugging purpose. */
477 if (interp->finalizing) {
478 return 0;
479 }
480
481 if (encoding != NULL) {
482 PyObject *handler = _PyCodec_Lookup(encoding);
483 if (handler == NULL) {
484 return -1;
485 }
486 Py_DECREF(handler);
487 }
488
489 if (errors != NULL) {
490 PyObject *handler = PyCodec_LookupError(errors);
491 if (handler == NULL) {
492 return -1;
493 }
494 Py_DECREF(handler);
495 }
496 return 0;
497 }
498
499
/* Check the internal invariants of a string object.

   op:            object to check, must not be NULL (asserted).
   check_content: if non-zero, also scan all characters (O(n)) to verify that
                  the narrowest possible kind is used and that the data ends
                  with a null character.

   Every violated invariant is reported through _PyObject_ASSERT_FAILED_MSG()
   with the failed expression as message.  When the function returns, the
   result is always 1, which makes it usable inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
    unsigned int kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the compact struct */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                  || kind == PyUnicode_2BYTE_KIND
                  || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            /* not compact: the characters live behind the data pointer */
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr representation exists */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                CHECK(kind == PyUnicode_1BYTE_KIND
                      || kind == PyUnicode_2BYTE_KIND
                      || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must point at the data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* a missing representation must have a zero length */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        const void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point of the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a null character */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
616
617
618 static PyObject*
unicode_result_wchar(PyObject * unicode)619 unicode_result_wchar(PyObject *unicode)
620 {
621 #ifndef Py_DEBUG
622 Py_ssize_t len;
623
624 len = _PyUnicode_WSTR_LENGTH(unicode);
625 if (len == 0) {
626 Py_DECREF(unicode);
627 _Py_RETURN_UNICODE_EMPTY();
628 }
629
630 if (len == 1) {
631 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
632 if ((Py_UCS4)ch < 256) {
633 Py_DECREF(unicode);
634 return get_latin1_char((unsigned char)ch);
635 }
636 }
637
638 if (_PyUnicode_Ready(unicode) < 0) {
639 Py_DECREF(unicode);
640 return NULL;
641 }
642 #else
643 assert(Py_REFCNT(unicode) == 1);
644
645 /* don't make the result ready in debug mode to ensure that the caller
646 makes the string ready before using it */
647 assert(_PyUnicode_CheckConsistency(unicode, 1));
648 #endif
649 return unicode;
650 }
651
652 static PyObject*
unicode_result_ready(PyObject * unicode)653 unicode_result_ready(PyObject *unicode)
654 {
655 Py_ssize_t length;
656
657 length = PyUnicode_GET_LENGTH(unicode);
658 if (length == 0) {
659 PyObject *empty = unicode_get_empty();
660 if (unicode != empty) {
661 Py_DECREF(unicode);
662 Py_INCREF(empty);
663 }
664 return empty;
665 }
666
667 if (length == 1) {
668 int kind = PyUnicode_KIND(unicode);
669 if (kind == PyUnicode_1BYTE_KIND) {
670 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
671 Py_UCS1 ch = data[0];
672 PyObject *latin1_char = LATIN1(ch);
673 if (unicode != latin1_char) {
674 Py_INCREF(latin1_char);
675 Py_DECREF(unicode);
676 }
677 return latin1_char;
678 }
679 }
680
681 assert(_PyUnicode_CheckConsistency(unicode, 1));
682 return unicode;
683 }
684
685 static PyObject*
unicode_result(PyObject * unicode)686 unicode_result(PyObject *unicode)
687 {
688 assert(_PyUnicode_CHECK(unicode));
689 if (PyUnicode_IS_READY(unicode))
690 return unicode_result_ready(unicode);
691 else
692 return unicode_result_wchar(unicode);
693 }
694
695 static PyObject*
unicode_result_unchanged(PyObject * unicode)696 unicode_result_unchanged(PyObject *unicode)
697 {
698 if (PyUnicode_CheckExact(unicode)) {
699 if (PyUnicode_READY(unicode) == -1)
700 return NULL;
701 Py_INCREF(unicode);
702 return unicode;
703 }
704 else
705 /* Subtype -- return genuine unicode string with the same value. */
706 return _PyUnicode_Copy(unicode);
707 }
708
709 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
710 ASCII, Latin1, UTF-8, etc. */
711 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)712 backslashreplace(_PyBytesWriter *writer, char *str,
713 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
714 {
715 Py_ssize_t size, i;
716 Py_UCS4 ch;
717 enum PyUnicode_Kind kind;
718 const void *data;
719
720 assert(PyUnicode_IS_READY(unicode));
721 kind = PyUnicode_KIND(unicode);
722 data = PyUnicode_DATA(unicode);
723
724 size = 0;
725 /* determine replacement size */
726 for (i = collstart; i < collend; ++i) {
727 Py_ssize_t incr;
728
729 ch = PyUnicode_READ(kind, data, i);
730 if (ch < 0x100)
731 incr = 2+2;
732 else if (ch < 0x10000)
733 incr = 2+4;
734 else {
735 assert(ch <= MAX_UNICODE);
736 incr = 2+8;
737 }
738 if (size > PY_SSIZE_T_MAX - incr) {
739 PyErr_SetString(PyExc_OverflowError,
740 "encoded result is too long for a Python string");
741 return NULL;
742 }
743 size += incr;
744 }
745
746 str = _PyBytesWriter_Prepare(writer, str, size);
747 if (str == NULL)
748 return NULL;
749
750 /* generate replacement */
751 for (i = collstart; i < collend; ++i) {
752 ch = PyUnicode_READ(kind, data, i);
753 *str++ = '\\';
754 if (ch >= 0x00010000) {
755 *str++ = 'U';
756 *str++ = Py_hexdigits[(ch>>28)&0xf];
757 *str++ = Py_hexdigits[(ch>>24)&0xf];
758 *str++ = Py_hexdigits[(ch>>20)&0xf];
759 *str++ = Py_hexdigits[(ch>>16)&0xf];
760 *str++ = Py_hexdigits[(ch>>12)&0xf];
761 *str++ = Py_hexdigits[(ch>>8)&0xf];
762 }
763 else if (ch >= 0x100) {
764 *str++ = 'u';
765 *str++ = Py_hexdigits[(ch>>12)&0xf];
766 *str++ = Py_hexdigits[(ch>>8)&0xf];
767 }
768 else
769 *str++ = 'x';
770 *str++ = Py_hexdigits[(ch>>4)&0xf];
771 *str++ = Py_hexdigits[ch&0xf];
772 }
773 return str;
774 }
775
776 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
777 ASCII, Latin1, UTF-8, etc. */
778 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)779 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
780 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
781 {
782 Py_ssize_t size, i;
783 Py_UCS4 ch;
784 enum PyUnicode_Kind kind;
785 const void *data;
786
787 assert(PyUnicode_IS_READY(unicode));
788 kind = PyUnicode_KIND(unicode);
789 data = PyUnicode_DATA(unicode);
790
791 size = 0;
792 /* determine replacement size */
793 for (i = collstart; i < collend; ++i) {
794 Py_ssize_t incr;
795
796 ch = PyUnicode_READ(kind, data, i);
797 if (ch < 10)
798 incr = 2+1+1;
799 else if (ch < 100)
800 incr = 2+2+1;
801 else if (ch < 1000)
802 incr = 2+3+1;
803 else if (ch < 10000)
804 incr = 2+4+1;
805 else if (ch < 100000)
806 incr = 2+5+1;
807 else if (ch < 1000000)
808 incr = 2+6+1;
809 else {
810 assert(ch <= MAX_UNICODE);
811 incr = 2+7+1;
812 }
813 if (size > PY_SSIZE_T_MAX - incr) {
814 PyErr_SetString(PyExc_OverflowError,
815 "encoded result is too long for a Python string");
816 return NULL;
817 }
818 size += incr;
819 }
820
821 str = _PyBytesWriter_Prepare(writer, str, size);
822 if (str == NULL)
823 return NULL;
824
825 /* generate replacement */
826 for (i = collstart; i < collend; ++i) {
827 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
828 if (size < 0) {
829 return NULL;
830 }
831 str += size;
832 }
833 return str;
834 }
835
/* --- Bloom Filters ----------------------------------------------------- */

/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch modulo BLOOM_WIDTH) as the bit index. */

/* the linebreak mask is set up by _PyUnicode_Init() below */

/* Number of bits used in a mask: the largest of 128, 64 and 32 that does not
   exceed LONG_BIT.  A LONG_BIT below 32 is rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding one mask. */
#define BLOOM_MASK unsigned long

/* Mask for line break characters.  It starts out with every bit set, so a
   lookup in it reports "maybe" for any character until a real mask is
   stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
857
/* Non-zero if the bit selected by ch (ch modulo BLOOM_WIDTH) is set in mask,
   zero otherwise.  Both arguments are parenthesized so that any expression
   (e.g. the OR of two masks) can be passed. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
859
/* True if ch is a line break character.  Characters below 128 are looked up
   in the ascii_linebreak table; for the others the bloom_linebreak mask is
   tested first and Py_UNICODE_ISLINEBREAK() is only evaluated when the mask
   does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
863
864 static inline BLOOM_MASK
make_bloom_mask(int kind,const void * ptr,Py_ssize_t len)865 make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
866 {
867 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
868 do { \
869 TYPE *data = (TYPE *)PTR; \
870 TYPE *end = data + LEN; \
871 Py_UCS4 ch; \
872 for (; data != end; data++) { \
873 ch = *data; \
874 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
875 } \
876 break; \
877 } while (0)
878
879 /* calculate simple bloom-style bitmask for a given unicode string */
880
881 BLOOM_MASK mask;
882
883 mask = 0;
884 switch (kind) {
885 case PyUnicode_1BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
887 break;
888 case PyUnicode_2BYTE_KIND:
889 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
890 break;
891 case PyUnicode_4BYTE_KIND:
892 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
893 break;
894 default:
895 Py_UNREACHABLE();
896 }
897 return mask;
898
899 #undef BLOOM_UPDATE
900 }
901
902 static int
ensure_unicode(PyObject * obj)903 ensure_unicode(PyObject *obj)
904 {
905 if (!PyUnicode_Check(obj)) {
906 PyErr_Format(PyExc_TypeError,
907 "must be str, not %.100s",
908 Py_TYPE(obj)->tp_name);
909 return -1;
910 }
911 return PyUnicode_READY(obj);
912 }
913
914 /* Compilation of templated routines */
915
916 #define STRINGLIB_GET_EMPTY() unicode_get_empty()
917
918 #include "stringlib/asciilib.h"
919 #include "stringlib/fastsearch.h"
920 #include "stringlib/partition.h"
921 #include "stringlib/split.h"
922 #include "stringlib/count.h"
923 #include "stringlib/find.h"
924 #include "stringlib/find_max_char.h"
925 #include "stringlib/undef.h"
926
927 #include "stringlib/ucs1lib.h"
928 #include "stringlib/fastsearch.h"
929 #include "stringlib/partition.h"
930 #include "stringlib/split.h"
931 #include "stringlib/count.h"
932 #include "stringlib/find.h"
933 #include "stringlib/replace.h"
934 #include "stringlib/find_max_char.h"
935 #include "stringlib/undef.h"
936
937 #include "stringlib/ucs2lib.h"
938 #include "stringlib/fastsearch.h"
939 #include "stringlib/partition.h"
940 #include "stringlib/split.h"
941 #include "stringlib/count.h"
942 #include "stringlib/find.h"
943 #include "stringlib/replace.h"
944 #include "stringlib/find_max_char.h"
945 #include "stringlib/undef.h"
946
947 #include "stringlib/ucs4lib.h"
948 #include "stringlib/fastsearch.h"
949 #include "stringlib/partition.h"
950 #include "stringlib/split.h"
951 #include "stringlib/count.h"
952 #include "stringlib/find.h"
953 #include "stringlib/replace.h"
954 #include "stringlib/find_max_char.h"
955 #include "stringlib/undef.h"
956
957 _Py_COMP_DIAG_PUSH
958 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
959 #include "stringlib/unicodedefs.h"
960 #include "stringlib/fastsearch.h"
961 #include "stringlib/count.h"
962 #include "stringlib/find.h"
963 #include "stringlib/undef.h"
964 _Py_COMP_DIAG_POP
965
966 #undef STRINGLIB_GET_EMPTY
967
968 /* --- Unicode Object ----------------------------------------------------- */
969
970 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)971 findchar(const void *s, int kind,
972 Py_ssize_t size, Py_UCS4 ch,
973 int direction)
974 {
975 switch (kind) {
976 case PyUnicode_1BYTE_KIND:
977 if ((Py_UCS1) ch != ch)
978 return -1;
979 if (direction > 0)
980 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
981 else
982 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
983 case PyUnicode_2BYTE_KIND:
984 if ((Py_UCS2) ch != ch)
985 return -1;
986 if (direction > 0)
987 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
988 else
989 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
990 case PyUnicode_4BYTE_KIND:
991 if (direction > 0)
992 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
993 else
994 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
995 default:
996 Py_UNREACHABLE();
997 }
998 }
999
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, every byte with 0xff; nothing happens if the string did not
   grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);   /* bytes per character */
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1018
/* Resize a compact string to `length` characters by reallocating the object
   itself.

   unicode: string to resize; it must be modifiable, ready and compact
            (asserted).
   length:  new number of characters, not counting the trailing null.

   Returns the resized object, whose address may differ from `unicode`, or
   NULL with MemoryError set if the new size would overflow or the
   reallocation fails.  A separately allocated UTF-8 block is freed before
   the reallocation is attempted; a separately allocated wstr block is freed
   after it succeeded. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the size of one character in bytes */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember whether wstr aliases the data: the data may move below */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* refuse lengths for which struct_size + (length + 1) * char_size
       would exceed PY_SSIZE_T_MAX */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): the object is presumably taken out of the reference
       debugging bookkeeping here because realloc may move it; it is
       registered again with _Py_NewReference() below -- confirm. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* the original object is still valid: register it again */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* re-point wstr at the (possibly moved) data */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the trailing null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1089
1090 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)1091 resize_inplace(PyObject *unicode, Py_ssize_t length)
1092 {
1093 wchar_t *wstr;
1094 Py_ssize_t new_size;
1095 assert(!PyUnicode_IS_COMPACT(unicode));
1096 assert(Py_REFCNT(unicode) == 1);
1097
1098 if (PyUnicode_IS_READY(unicode)) {
1099 Py_ssize_t char_size;
1100 int share_wstr, share_utf8;
1101 void *data;
1102 #ifdef Py_DEBUG
1103 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1104 #endif
1105
1106 data = _PyUnicode_DATA_ANY(unicode);
1107 char_size = PyUnicode_KIND(unicode);
1108 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1109 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1110
1111 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1112 PyErr_NoMemory();
1113 return -1;
1114 }
1115 new_size = (length + 1) * char_size;
1116
1117 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1118 {
1119 PyObject_Free(_PyUnicode_UTF8(unicode));
1120 _PyUnicode_UTF8(unicode) = NULL;
1121 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1122 }
1123
1124 data = (PyObject *)PyObject_Realloc(data, new_size);
1125 if (data == NULL) {
1126 PyErr_NoMemory();
1127 return -1;
1128 }
1129 _PyUnicode_DATA_ANY(unicode) = data;
1130 if (share_wstr) {
1131 _PyUnicode_WSTR(unicode) = data;
1132 _PyUnicode_WSTR_LENGTH(unicode) = length;
1133 }
1134 if (share_utf8) {
1135 _PyUnicode_UTF8(unicode) = data;
1136 _PyUnicode_UTF8_LENGTH(unicode) = length;
1137 }
1138 _PyUnicode_LENGTH(unicode) = length;
1139 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1140 #ifdef Py_DEBUG
1141 unicode_fill_invalid(unicode, old_length);
1142 #endif
1143 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1144 assert(_PyUnicode_CheckConsistency(unicode, 0));
1145 return 0;
1146 }
1147 }
1148 assert(_PyUnicode_WSTR(unicode) != NULL);
1149
1150 /* check for integer overflow */
1151 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1152 PyErr_NoMemory();
1153 return -1;
1154 }
1155 new_size = sizeof(wchar_t) * (length + 1);
1156 wstr = _PyUnicode_WSTR(unicode);
1157 wstr = PyObject_Realloc(wstr, new_size);
1158 if (!wstr) {
1159 PyErr_NoMemory();
1160 return -1;
1161 }
1162 _PyUnicode_WSTR(unicode) = wstr;
1163 _PyUnicode_WSTR(unicode)[length] = 0;
1164 _PyUnicode_WSTR_LENGTH(unicode) = length;
1165 assert(_PyUnicode_CheckConsistency(unicode, 0));
1166 return 0;
1167 }
1168
1169 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1170 resize_copy(PyObject *unicode, Py_ssize_t length)
1171 {
1172 Py_ssize_t copy_length;
1173 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1174 PyObject *copy;
1175
1176 assert(PyUnicode_IS_READY(unicode));
1177
1178 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1179 if (copy == NULL)
1180 return NULL;
1181
1182 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1183 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1184 return copy;
1185 }
1186 else {
1187 PyObject *w;
1188
1189 w = (PyObject*)_PyUnicode_New(length);
1190 if (w == NULL)
1191 return NULL;
1192 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1193 copy_length = Py_MIN(copy_length, length);
1194 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1195 copy_length * sizeof(wchar_t));
1196 return w;
1197 }
1198 }
1199
1200 /* We allocate one more byte to make sure the string is
1201 Ux0000 terminated; some code (e.g. new_identifier)
1202 relies on that.
1203
1204 XXX This allocator could further be enhanced by assuring that the
1205 free list never reduces its size below 1.
1206
1207 */
1208
1209 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1210 _PyUnicode_New(Py_ssize_t length)
1211 {
1212 PyUnicodeObject *unicode;
1213 size_t new_size;
1214
1215 /* Optimization for empty strings */
1216 if (length == 0) {
1217 return (PyUnicodeObject *)unicode_new_empty();
1218 }
1219
1220 /* Ensure we won't overflow the size. */
1221 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1222 return (PyUnicodeObject *)PyErr_NoMemory();
1223 }
1224 if (length < 0) {
1225 PyErr_SetString(PyExc_SystemError,
1226 "Negative size passed to _PyUnicode_New");
1227 return NULL;
1228 }
1229
1230 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1231 if (unicode == NULL)
1232 return NULL;
1233 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1234
1235 _PyUnicode_WSTR_LENGTH(unicode) = length;
1236 _PyUnicode_HASH(unicode) = -1;
1237 _PyUnicode_STATE(unicode).interned = 0;
1238 _PyUnicode_STATE(unicode).kind = 0;
1239 _PyUnicode_STATE(unicode).compact = 0;
1240 _PyUnicode_STATE(unicode).ready = 0;
1241 _PyUnicode_STATE(unicode).ascii = 0;
1242 _PyUnicode_DATA_ANY(unicode) = NULL;
1243 _PyUnicode_LENGTH(unicode) = 0;
1244 _PyUnicode_UTF8(unicode) = NULL;
1245 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1246
1247 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
1248 if (!_PyUnicode_WSTR(unicode)) {
1249 Py_DECREF(unicode);
1250 PyErr_NoMemory();
1251 return NULL;
1252 }
1253
1254 /* Initialize the first element to guard against cases where
1255 * the caller fails before initializing str -- unicode_resize()
1256 * reads str[0], and the Keep-Alive optimization can keep memory
1257 * allocated for str alive across a call to unicode_dealloc(unicode).
1258 * We don't want unicode_resize to read uninitialized memory in
1259 * that case.
1260 */
1261 _PyUnicode_WSTR(unicode)[0] = 0;
1262 _PyUnicode_WSTR(unicode)[length] = 0;
1263
1264 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1265 return unicode;
1266 }
1267
1268 static const char*
unicode_kind_name(PyObject * unicode)1269 unicode_kind_name(PyObject *unicode)
1270 {
1271 /* don't check consistency: unicode_kind_name() is called from
1272 _PyUnicode_Dump() */
1273 if (!PyUnicode_IS_COMPACT(unicode))
1274 {
1275 if (!PyUnicode_IS_READY(unicode))
1276 return "wstr";
1277 switch (PyUnicode_KIND(unicode))
1278 {
1279 case PyUnicode_1BYTE_KIND:
1280 if (PyUnicode_IS_ASCII(unicode))
1281 return "legacy ascii";
1282 else
1283 return "legacy latin1";
1284 case PyUnicode_2BYTE_KIND:
1285 return "legacy UCS2";
1286 case PyUnicode_4BYTE_KIND:
1287 return "legacy UCS4";
1288 default:
1289 return "<legacy invalid kind>";
1290 }
1291 }
1292 assert(PyUnicode_IS_READY(unicode));
1293 switch (PyUnicode_KIND(unicode)) {
1294 case PyUnicode_1BYTE_KIND:
1295 if (PyUnicode_IS_ASCII(unicode))
1296 return "ascii";
1297 else
1298 return "latin1";
1299 case PyUnicode_2BYTE_KIND:
1300 return "UCS2";
1301 case PyUnicode_4BYTE_KIND:
1302 return "UCS4";
1303 default:
1304 return "<invalid compact kind>";
1305 }
1306 }
1307
1308 #ifdef Py_DEBUG
1309 /* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger.
   `unicode_raw` is treated as a PyObject pointer. */
const char *_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1314
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger.  `unicode_raw` is treated as a PyObject pointer. */
const void *_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
_PyUnicode_data(void * unicode_raw)1319 const void *_PyUnicode_data(void *unicode_raw) {
1320 PyObject *unicode = _PyObject_CAST(unicode_raw);
1321 printf("obj %p\n", (void*)unicode);
1322 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1323 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1324 printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1325 printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1326 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1327 return PyUnicode_DATA(unicode);
1328 }
1329
1330 void
_PyUnicode_Dump(PyObject * op)1331 _PyUnicode_Dump(PyObject *op)
1332 {
1333 PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1334 PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1335 PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1336 const void *data;
1337
1338 if (ascii->state.compact)
1339 {
1340 if (ascii->state.ascii)
1341 data = (ascii + 1);
1342 else
1343 data = (compact + 1);
1344 }
1345 else
1346 data = unicode->data.any;
1347 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1348
1349 if (ascii->wstr == data)
1350 printf("shared ");
1351 printf("wstr=%p", (void *)ascii->wstr);
1352
1353 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1354 printf(" (%zu), ", compact->wstr_length);
1355 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
1356 printf("shared ");
1357 }
1358 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1359 }
1360 printf(", data=%p\n", data);
1361 }
1362 #endif
1363
1364
1365 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1366 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1367 {
1368 /* Optimization for empty strings */
1369 if (size == 0) {
1370 return unicode_new_empty();
1371 }
1372
1373 PyObject *obj;
1374 PyCompactUnicodeObject *unicode;
1375 void *data;
1376 enum PyUnicode_Kind kind;
1377 int is_sharing, is_ascii;
1378 Py_ssize_t char_size;
1379 Py_ssize_t struct_size;
1380
1381 is_ascii = 0;
1382 is_sharing = 0;
1383 struct_size = sizeof(PyCompactUnicodeObject);
1384 if (maxchar < 128) {
1385 kind = PyUnicode_1BYTE_KIND;
1386 char_size = 1;
1387 is_ascii = 1;
1388 struct_size = sizeof(PyASCIIObject);
1389 }
1390 else if (maxchar < 256) {
1391 kind = PyUnicode_1BYTE_KIND;
1392 char_size = 1;
1393 }
1394 else if (maxchar < 65536) {
1395 kind = PyUnicode_2BYTE_KIND;
1396 char_size = 2;
1397 if (sizeof(wchar_t) == 2)
1398 is_sharing = 1;
1399 }
1400 else {
1401 if (maxchar > MAX_UNICODE) {
1402 PyErr_SetString(PyExc_SystemError,
1403 "invalid maximum character passed to PyUnicode_New");
1404 return NULL;
1405 }
1406 kind = PyUnicode_4BYTE_KIND;
1407 char_size = 4;
1408 if (sizeof(wchar_t) == 4)
1409 is_sharing = 1;
1410 }
1411
1412 /* Ensure we won't overflow the size. */
1413 if (size < 0) {
1414 PyErr_SetString(PyExc_SystemError,
1415 "Negative size passed to PyUnicode_New");
1416 return NULL;
1417 }
1418 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1419 return PyErr_NoMemory();
1420
1421 /* Duplicated allocation code from _PyObject_New() instead of a call to
1422 * PyObject_New() so we are able to allocate space for the object and
1423 * it's data buffer.
1424 */
1425 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1426 if (obj == NULL) {
1427 return PyErr_NoMemory();
1428 }
1429 _PyObject_Init(obj, &PyUnicode_Type);
1430
1431 unicode = (PyCompactUnicodeObject *)obj;
1432 if (is_ascii)
1433 data = ((PyASCIIObject*)obj) + 1;
1434 else
1435 data = unicode + 1;
1436 _PyUnicode_LENGTH(unicode) = size;
1437 _PyUnicode_HASH(unicode) = -1;
1438 _PyUnicode_STATE(unicode).interned = 0;
1439 _PyUnicode_STATE(unicode).kind = kind;
1440 _PyUnicode_STATE(unicode).compact = 1;
1441 _PyUnicode_STATE(unicode).ready = 1;
1442 _PyUnicode_STATE(unicode).ascii = is_ascii;
1443 if (is_ascii) {
1444 ((char*)data)[size] = 0;
1445 _PyUnicode_WSTR(unicode) = NULL;
1446 }
1447 else if (kind == PyUnicode_1BYTE_KIND) {
1448 ((char*)data)[size] = 0;
1449 _PyUnicode_WSTR(unicode) = NULL;
1450 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451 unicode->utf8 = NULL;
1452 unicode->utf8_length = 0;
1453 }
1454 else {
1455 unicode->utf8 = NULL;
1456 unicode->utf8_length = 0;
1457 if (kind == PyUnicode_2BYTE_KIND)
1458 ((Py_UCS2*)data)[size] = 0;
1459 else /* kind == PyUnicode_4BYTE_KIND */
1460 ((Py_UCS4*)data)[size] = 0;
1461 if (is_sharing) {
1462 _PyUnicode_WSTR_LENGTH(unicode) = size;
1463 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1464 }
1465 else {
1466 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 }
1469 }
1470 #ifdef Py_DEBUG
1471 unicode_fill_invalid((PyObject*)unicode, 0);
1472 #endif
1473 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1474 return obj;
1475 }
1476
1477 #if SIZEOF_WCHAR_T == 2
1478 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1479 will decode surrogate pairs, the other conversions are implemented as macros
1480 for efficiency.
1481
1482 This function assumes that unicode can hold one more code point than wstr
1483 characters for a terminating null character. */
1484 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1485 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1486 PyObject *unicode)
1487 {
1488 const wchar_t *iter;
1489 Py_UCS4 *ucs4_out;
1490
1491 assert(unicode != NULL);
1492 assert(_PyUnicode_CHECK(unicode));
1493 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1494 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1495
1496 for (iter = begin; iter < end; ) {
1497 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1498 _PyUnicode_GET_LENGTH(unicode)));
1499 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1500 && (iter+1) < end
1501 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1502 {
1503 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1504 iter += 2;
1505 }
1506 else {
1507 *ucs4_out++ = *iter;
1508 iter++;
1509 }
1510 }
1511 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1512 _PyUnicode_GET_LENGTH(unicode)));
1513
1514 }
1515 #endif
1516
1517 static int
unicode_check_modifiable(PyObject * unicode)1518 unicode_check_modifiable(PyObject *unicode)
1519 {
1520 if (!unicode_modifiable(unicode)) {
1521 PyErr_SetString(PyExc_SystemError,
1522 "Cannot modify a string currently used");
1523 return -1;
1524 }
1525 return 0;
1526 }
1527
1528 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1529 _copy_characters(PyObject *to, Py_ssize_t to_start,
1530 PyObject *from, Py_ssize_t from_start,
1531 Py_ssize_t how_many, int check_maxchar)
1532 {
1533 unsigned int from_kind, to_kind;
1534 const void *from_data;
1535 void *to_data;
1536
1537 assert(0 <= how_many);
1538 assert(0 <= from_start);
1539 assert(0 <= to_start);
1540 assert(PyUnicode_Check(from));
1541 assert(PyUnicode_IS_READY(from));
1542 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1543
1544 assert(PyUnicode_Check(to));
1545 assert(PyUnicode_IS_READY(to));
1546 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1547
1548 if (how_many == 0)
1549 return 0;
1550
1551 from_kind = PyUnicode_KIND(from);
1552 from_data = PyUnicode_DATA(from);
1553 to_kind = PyUnicode_KIND(to);
1554 to_data = PyUnicode_DATA(to);
1555
1556 #ifdef Py_DEBUG
1557 if (!check_maxchar
1558 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1559 {
1560 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1561 Py_UCS4 ch;
1562 Py_ssize_t i;
1563 for (i=0; i < how_many; i++) {
1564 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1565 assert(ch <= to_maxchar);
1566 }
1567 }
1568 #endif
1569
1570 if (from_kind == to_kind) {
1571 if (check_maxchar
1572 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1573 {
1574 /* Writing Latin-1 characters into an ASCII string requires to
1575 check that all written characters are pure ASCII */
1576 Py_UCS4 max_char;
1577 max_char = ucs1lib_find_max_char(from_data,
1578 (const Py_UCS1*)from_data + how_many);
1579 if (max_char >= 128)
1580 return -1;
1581 }
1582 memcpy((char*)to_data + to_kind * to_start,
1583 (const char*)from_data + from_kind * from_start,
1584 to_kind * how_many);
1585 }
1586 else if (from_kind == PyUnicode_1BYTE_KIND
1587 && to_kind == PyUnicode_2BYTE_KIND)
1588 {
1589 _PyUnicode_CONVERT_BYTES(
1590 Py_UCS1, Py_UCS2,
1591 PyUnicode_1BYTE_DATA(from) + from_start,
1592 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1593 PyUnicode_2BYTE_DATA(to) + to_start
1594 );
1595 }
1596 else if (from_kind == PyUnicode_1BYTE_KIND
1597 && to_kind == PyUnicode_4BYTE_KIND)
1598 {
1599 _PyUnicode_CONVERT_BYTES(
1600 Py_UCS1, Py_UCS4,
1601 PyUnicode_1BYTE_DATA(from) + from_start,
1602 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1603 PyUnicode_4BYTE_DATA(to) + to_start
1604 );
1605 }
1606 else if (from_kind == PyUnicode_2BYTE_KIND
1607 && to_kind == PyUnicode_4BYTE_KIND)
1608 {
1609 _PyUnicode_CONVERT_BYTES(
1610 Py_UCS2, Py_UCS4,
1611 PyUnicode_2BYTE_DATA(from) + from_start,
1612 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1613 PyUnicode_4BYTE_DATA(to) + to_start
1614 );
1615 }
1616 else {
1617 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1618
1619 if (!check_maxchar) {
1620 if (from_kind == PyUnicode_2BYTE_KIND
1621 && to_kind == PyUnicode_1BYTE_KIND)
1622 {
1623 _PyUnicode_CONVERT_BYTES(
1624 Py_UCS2, Py_UCS1,
1625 PyUnicode_2BYTE_DATA(from) + from_start,
1626 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1627 PyUnicode_1BYTE_DATA(to) + to_start
1628 );
1629 }
1630 else if (from_kind == PyUnicode_4BYTE_KIND
1631 && to_kind == PyUnicode_1BYTE_KIND)
1632 {
1633 _PyUnicode_CONVERT_BYTES(
1634 Py_UCS4, Py_UCS1,
1635 PyUnicode_4BYTE_DATA(from) + from_start,
1636 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1637 PyUnicode_1BYTE_DATA(to) + to_start
1638 );
1639 }
1640 else if (from_kind == PyUnicode_4BYTE_KIND
1641 && to_kind == PyUnicode_2BYTE_KIND)
1642 {
1643 _PyUnicode_CONVERT_BYTES(
1644 Py_UCS4, Py_UCS2,
1645 PyUnicode_4BYTE_DATA(from) + from_start,
1646 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1647 PyUnicode_2BYTE_DATA(to) + to_start
1648 );
1649 }
1650 else {
1651 Py_UNREACHABLE();
1652 }
1653 }
1654 else {
1655 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1656 Py_UCS4 ch;
1657 Py_ssize_t i;
1658
1659 for (i=0; i < how_many; i++) {
1660 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1661 if (ch > to_maxchar)
1662 return -1;
1663 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1664 }
1665 }
1666 }
1667 return 0;
1668 }
1669
1670 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1671 _PyUnicode_FastCopyCharacters(
1672 PyObject *to, Py_ssize_t to_start,
1673 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1674 {
1675 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1676 }
1677
1678 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1679 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1680 PyObject *from, Py_ssize_t from_start,
1681 Py_ssize_t how_many)
1682 {
1683 int err;
1684
1685 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1686 PyErr_BadInternalCall();
1687 return -1;
1688 }
1689
1690 if (PyUnicode_READY(from) == -1)
1691 return -1;
1692 if (PyUnicode_READY(to) == -1)
1693 return -1;
1694
1695 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1696 PyErr_SetString(PyExc_IndexError, "string index out of range");
1697 return -1;
1698 }
1699 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1700 PyErr_SetString(PyExc_IndexError, "string index out of range");
1701 return -1;
1702 }
1703 if (how_many < 0) {
1704 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1705 return -1;
1706 }
1707 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1708 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1709 PyErr_Format(PyExc_SystemError,
1710 "Cannot write %zi characters at %zi "
1711 "in a string of %zi characters",
1712 how_many, to_start, PyUnicode_GET_LENGTH(to));
1713 return -1;
1714 }
1715
1716 if (how_many == 0)
1717 return 0;
1718
1719 if (unicode_check_modifiable(to))
1720 return -1;
1721
1722 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1723 if (err) {
1724 PyErr_Format(PyExc_SystemError,
1725 "Cannot copy %s characters "
1726 "into a string of %s characters",
1727 unicode_kind_name(from),
1728 unicode_kind_name(to));
1729 return -1;
1730 }
1731 return how_many;
1732 }
1733
1734 /* Find the maximum code point and count the number of surrogate pairs so a
1735 correct string length can be computed before converting a string to UCS4.
1736 This function counts single surrogates as a character and not as a pair.
1737
1738 Return 0 on success, or -1 on error. */
1739 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1740 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1741 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1742 {
1743 const wchar_t *iter;
1744 Py_UCS4 ch;
1745
1746 assert(num_surrogates != NULL && maxchar != NULL);
1747 *num_surrogates = 0;
1748 *maxchar = 0;
1749
1750 for (iter = begin; iter < end; ) {
1751 #if SIZEOF_WCHAR_T == 2
1752 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1753 && (iter+1) < end
1754 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1755 {
1756 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1757 ++(*num_surrogates);
1758 iter += 2;
1759 }
1760 else
1761 #endif
1762 {
1763 ch = *iter;
1764 iter++;
1765 }
1766 if (ch > *maxchar) {
1767 *maxchar = ch;
1768 if (*maxchar > MAX_UNICODE) {
1769 PyErr_Format(PyExc_ValueError,
1770 "character U+%x is not in range [U+0000; U+%x]",
1771 ch, MAX_UNICODE);
1772 return -1;
1773 }
1774 }
1775 }
1776 return 0;
1777 }
1778
1779 int
_PyUnicode_Ready(PyObject * unicode)1780 _PyUnicode_Ready(PyObject *unicode)
1781 {
1782 wchar_t *end;
1783 Py_UCS4 maxchar = 0;
1784 Py_ssize_t num_surrogates;
1785 #if SIZEOF_WCHAR_T == 2
1786 Py_ssize_t length_wo_surrogates;
1787 #endif
1788
1789 /* _PyUnicode_Ready() is only intended for old-style API usage where
1790 strings were created using _PyObject_New() and where no canonical
1791 representation (the str field) has been set yet aka strings
1792 which are not yet ready. */
1793 assert(_PyUnicode_CHECK(unicode));
1794 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1795 assert(_PyUnicode_WSTR(unicode) != NULL);
1796 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1797 assert(_PyUnicode_UTF8(unicode) == NULL);
1798 /* Actually, it should neither be interned nor be anything else: */
1799 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1800
1801 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1802 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1803 &maxchar, &num_surrogates) == -1)
1804 return -1;
1805
1806 if (maxchar < 256) {
1807 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1808 if (!_PyUnicode_DATA_ANY(unicode)) {
1809 PyErr_NoMemory();
1810 return -1;
1811 }
1812 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1813 _PyUnicode_WSTR(unicode), end,
1814 PyUnicode_1BYTE_DATA(unicode));
1815 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1816 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1817 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1818 if (maxchar < 128) {
1819 _PyUnicode_STATE(unicode).ascii = 1;
1820 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1821 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1822 }
1823 else {
1824 _PyUnicode_STATE(unicode).ascii = 0;
1825 _PyUnicode_UTF8(unicode) = NULL;
1826 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1827 }
1828 PyObject_Free(_PyUnicode_WSTR(unicode));
1829 _PyUnicode_WSTR(unicode) = NULL;
1830 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1831 }
1832 /* In this case we might have to convert down from 4-byte native
1833 wchar_t to 2-byte unicode. */
1834 else if (maxchar < 65536) {
1835 assert(num_surrogates == 0 &&
1836 "FindMaxCharAndNumSurrogatePairs() messed up");
1837
1838 #if SIZEOF_WCHAR_T == 2
1839 /* We can share representations and are done. */
1840 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1841 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1842 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1843 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1844 _PyUnicode_UTF8(unicode) = NULL;
1845 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1846 #else
1847 /* sizeof(wchar_t) == 4 */
1848 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
1849 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1850 if (!_PyUnicode_DATA_ANY(unicode)) {
1851 PyErr_NoMemory();
1852 return -1;
1853 }
1854 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1855 _PyUnicode_WSTR(unicode), end,
1856 PyUnicode_2BYTE_DATA(unicode));
1857 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1858 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1859 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1860 _PyUnicode_UTF8(unicode) = NULL;
1861 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1862 PyObject_Free(_PyUnicode_WSTR(unicode));
1863 _PyUnicode_WSTR(unicode) = NULL;
1864 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1865 #endif
1866 }
1867 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
1868 else {
1869 #if SIZEOF_WCHAR_T == 2
1870 /* in case the native representation is 2-bytes, we need to allocate a
1871 new normalized 4-byte version. */
1872 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1873 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1874 PyErr_NoMemory();
1875 return -1;
1876 }
1877 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
1878 if (!_PyUnicode_DATA_ANY(unicode)) {
1879 PyErr_NoMemory();
1880 return -1;
1881 }
1882 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1883 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1884 _PyUnicode_UTF8(unicode) = NULL;
1885 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1886 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1887 _PyUnicode_STATE(unicode).ready = 1;
1888 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1889 PyObject_Free(_PyUnicode_WSTR(unicode));
1890 _PyUnicode_WSTR(unicode) = NULL;
1891 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1892 #else
1893 assert(num_surrogates == 0);
1894
1895 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1896 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1897 _PyUnicode_UTF8(unicode) = NULL;
1898 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1899 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1900 #endif
1901 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1902 }
1903 _PyUnicode_STATE(unicode).ready = 1;
1904 assert(_PyUnicode_CheckConsistency(unicode, 1));
1905 return 0;
1906 }
1907
1908 static void
unicode_dealloc(PyObject * unicode)1909 unicode_dealloc(PyObject *unicode)
1910 {
1911 #ifdef Py_DEBUG
1912 if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1913 _Py_FatalRefcountError("deallocating an Unicode singleton");
1914 }
1915 #endif
1916
1917 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1918 case SSTATE_NOT_INTERNED:
1919 break;
1920 case SSTATE_INTERNED_MORTAL:
1921 {
1922 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1923 references (key and value) which were ignored by
1924 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1925 to prevent calling unicode_dealloc() again. Adjust refcnt after
1926 PyDict_DelItem(). */
1927 assert(Py_REFCNT(unicode) == 0);
1928 Py_SET_REFCNT(unicode, 3);
1929 if (PyDict_DelItem(interned, unicode) != 0) {
1930 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1931 NULL);
1932 }
1933 assert(Py_REFCNT(unicode) == 1);
1934 Py_SET_REFCNT(unicode, 0);
1935 break;
1936 }
1937
1938 case SSTATE_INTERNED_IMMORTAL:
1939 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1940 break;
1941
1942 default:
1943 Py_UNREACHABLE();
1944 }
1945
1946 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1947 PyObject_Free(_PyUnicode_WSTR(unicode));
1948 }
1949 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1950 PyObject_Free(_PyUnicode_UTF8(unicode));
1951 }
1952 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1953 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
1954 }
1955
1956 Py_TYPE(unicode)->tp_free(unicode);
1957 }
1958
1959 #ifdef Py_DEBUG
1960 static int
unicode_is_singleton(PyObject * unicode)1961 unicode_is_singleton(PyObject *unicode)
1962 {
1963 if (unicode == &_Py_STR(empty)) {
1964 return 1;
1965 }
1966
1967 PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1968 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) {
1969 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1970 if (ch < 256 && LATIN1(ch) == unicode) {
1971 return 1;
1972 }
1973 }
1974 return 0;
1975 }
1976 #endif
1977
1978 static int
unicode_modifiable(PyObject * unicode)1979 unicode_modifiable(PyObject *unicode)
1980 {
1981 assert(_PyUnicode_CHECK(unicode));
1982 if (Py_REFCNT(unicode) != 1)
1983 return 0;
1984 if (_PyUnicode_HASH(unicode) != -1)
1985 return 0;
1986 if (PyUnicode_CHECK_INTERNED(unicode))
1987 return 0;
1988 if (!PyUnicode_CheckExact(unicode))
1989 return 0;
1990 #ifdef Py_DEBUG
1991 /* singleton refcount is greater than 1 */
1992 assert(!unicode_is_singleton(unicode));
1993 #endif
1994 return 1;
1995 }
1996
1997 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1998 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1999 {
2000 PyObject *unicode;
2001 Py_ssize_t old_length;
2002
2003 assert(p_unicode != NULL);
2004 unicode = *p_unicode;
2005
2006 assert(unicode != NULL);
2007 assert(PyUnicode_Check(unicode));
2008 assert(0 <= length);
2009
2010 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
2011 old_length = PyUnicode_WSTR_LENGTH(unicode);
2012 else
2013 old_length = PyUnicode_GET_LENGTH(unicode);
2014 if (old_length == length)
2015 return 0;
2016
2017 if (length == 0) {
2018 PyObject *empty = unicode_new_empty();
2019 Py_SETREF(*p_unicode, empty);
2020 return 0;
2021 }
2022
2023 if (!unicode_modifiable(unicode)) {
2024 PyObject *copy = resize_copy(unicode, length);
2025 if (copy == NULL)
2026 return -1;
2027 Py_SETREF(*p_unicode, copy);
2028 return 0;
2029 }
2030
2031 if (PyUnicode_IS_COMPACT(unicode)) {
2032 PyObject *new_unicode = resize_compact(unicode, length);
2033 if (new_unicode == NULL)
2034 return -1;
2035 *p_unicode = new_unicode;
2036 return 0;
2037 }
2038 return resize_inplace(unicode, length);
2039 }
2040
2041 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)2042 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
2043 {
2044 PyObject *unicode;
2045 if (p_unicode == NULL) {
2046 PyErr_BadInternalCall();
2047 return -1;
2048 }
2049 unicode = *p_unicode;
2050 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
2051 {
2052 PyErr_BadInternalCall();
2053 return -1;
2054 }
2055 return unicode_resize(p_unicode, length);
2056 }
2057
2058 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
2059
2060 WARNING: The function doesn't copy the terminating null character and
2061 doesn't check the maximum character (may write a latin1 character in an
2062 ASCII string). */
2063 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)2064 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2065 const char *str, Py_ssize_t len)
2066 {
2067 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2068 const void *data = PyUnicode_DATA(unicode);
2069 const char *end = str + len;
2070
2071 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
2072 switch (kind) {
2073 case PyUnicode_1BYTE_KIND: {
2074 #ifdef Py_DEBUG
2075 if (PyUnicode_IS_ASCII(unicode)) {
2076 Py_UCS4 maxchar = ucs1lib_find_max_char(
2077 (const Py_UCS1*)str,
2078 (const Py_UCS1*)str + len);
2079 assert(maxchar < 128);
2080 }
2081 #endif
2082 memcpy((char *) data + index, str, len);
2083 break;
2084 }
2085 case PyUnicode_2BYTE_KIND: {
2086 Py_UCS2 *start = (Py_UCS2 *)data + index;
2087 Py_UCS2 *ucs2 = start;
2088
2089 for (; str < end; ++ucs2, ++str)
2090 *ucs2 = (Py_UCS2)*str;
2091
2092 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2093 break;
2094 }
2095 case PyUnicode_4BYTE_KIND: {
2096 Py_UCS4 *start = (Py_UCS4 *)data + index;
2097 Py_UCS4 *ucs4 = start;
2098
2099 for (; str < end; ++ucs4, ++str)
2100 *ucs4 = (Py_UCS4)*str;
2101
2102 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2103 break;
2104 }
2105 default:
2106 Py_UNREACHABLE();
2107 }
2108 }
2109
2110 static PyObject*
get_latin1_char(Py_UCS1 ch)2111 get_latin1_char(Py_UCS1 ch)
2112 {
2113 return Py_NewRef(LATIN1(ch));
2114 }
2115
2116 static PyObject*
unicode_char(Py_UCS4 ch)2117 unicode_char(Py_UCS4 ch)
2118 {
2119 PyObject *unicode;
2120
2121 assert(ch <= MAX_UNICODE);
2122
2123 if (ch < 256) {
2124 return get_latin1_char(ch);
2125 }
2126
2127 unicode = PyUnicode_New(1, ch);
2128 if (unicode == NULL)
2129 return NULL;
2130
2131 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2132 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2133 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2134 } else {
2135 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2136 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2137 }
2138 assert(_PyUnicode_CheckConsistency(unicode, 1));
2139 return unicode;
2140 }
2141
2142 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2143 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2144 {
2145 if (u == NULL) {
2146 if (size > 0) {
2147 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2148 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2149 "use PyUnicode_New() instead", 1) < 0) {
2150 return NULL;
2151 }
2152 }
2153 return (PyObject*)_PyUnicode_New(size);
2154 }
2155
2156 if (size < 0) {
2157 PyErr_BadInternalCall();
2158 return NULL;
2159 }
2160
2161 return PyUnicode_FromWideChar(u, size);
2162 }
2163
2164 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2165 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2166 {
2167 PyObject *unicode;
2168 Py_UCS4 maxchar = 0;
2169 Py_ssize_t num_surrogates;
2170
2171 if (u == NULL && size != 0) {
2172 PyErr_BadInternalCall();
2173 return NULL;
2174 }
2175
2176 if (size == -1) {
2177 size = wcslen(u);
2178 }
2179
2180 /* If the Unicode data is known at construction time, we can apply
2181 some optimizations which share commonly used objects. */
2182
2183 /* Optimization for empty strings */
2184 if (size == 0)
2185 _Py_RETURN_UNICODE_EMPTY();
2186
2187 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2188 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2189 non-Unicode locales and hence needs conversion to UCS-4 first. */
2190 if (_Py_LocaleUsesNonUnicodeWchar()) {
2191 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2192 if (!converted) {
2193 return NULL;
2194 }
2195 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2196 PyMem_Free(converted);
2197 return unicode;
2198 }
2199 #endif
2200
2201 /* Single character Unicode objects in the Latin-1 range are
2202 shared when using this constructor */
2203 if (size == 1 && (Py_UCS4)*u < 256)
2204 return get_latin1_char((unsigned char)*u);
2205
2206 /* If not empty and not single character, copy the Unicode data
2207 into the new object */
2208 if (find_maxchar_surrogates(u, u + size,
2209 &maxchar, &num_surrogates) == -1)
2210 return NULL;
2211
2212 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2213 if (!unicode)
2214 return NULL;
2215
2216 switch (PyUnicode_KIND(unicode)) {
2217 case PyUnicode_1BYTE_KIND:
2218 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2219 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2220 break;
2221 case PyUnicode_2BYTE_KIND:
2222 #if Py_UNICODE_SIZE == 2
2223 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2224 #else
2225 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2226 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2227 #endif
2228 break;
2229 case PyUnicode_4BYTE_KIND:
2230 #if SIZEOF_WCHAR_T == 2
2231 /* This is the only case which has to process surrogates, thus
2232 a simple copy loop is not enough and we need a function. */
2233 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2234 #else
2235 assert(num_surrogates == 0);
2236 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2237 #endif
2238 break;
2239 default:
2240 Py_UNREACHABLE();
2241 }
2242
2243 return unicode_result(unicode);
2244 }
2245
2246 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2247 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2248 {
2249 if (size < 0) {
2250 PyErr_SetString(PyExc_SystemError,
2251 "Negative size passed to PyUnicode_FromStringAndSize");
2252 return NULL;
2253 }
2254 if (u != NULL) {
2255 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2256 }
2257 else {
2258 if (size > 0) {
2259 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2260 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2261 "use PyUnicode_New() instead", 1) < 0) {
2262 return NULL;
2263 }
2264 }
2265 return (PyObject *)_PyUnicode_New(size);
2266 }
2267 }
2268
2269 PyObject *
PyUnicode_FromString(const char * u)2270 PyUnicode_FromString(const char *u)
2271 {
2272 size_t size = strlen(u);
2273 if (size > PY_SSIZE_T_MAX) {
2274 PyErr_SetString(PyExc_OverflowError, "input too long");
2275 return NULL;
2276 }
2277 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2278 }
2279
2280
2281 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2282 _PyUnicode_FromId(_Py_Identifier *id)
2283 {
2284 PyInterpreterState *interp = _PyInterpreterState_GET();
2285 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2286
2287 Py_ssize_t index = _Py_atomic_size_get(&id->index);
2288 if (index < 0) {
2289 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2290
2291 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2292 // Check again to detect concurrent access. Another thread can have
2293 // initialized the index while this thread waited for the lock.
2294 index = _Py_atomic_size_get(&id->index);
2295 if (index < 0) {
2296 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2297 index = rt_ids->next_index;
2298 rt_ids->next_index++;
2299 _Py_atomic_size_set(&id->index, index);
2300 }
2301 PyThread_release_lock(rt_ids->lock);
2302 }
2303 assert(index >= 0);
2304
2305 PyObject *obj;
2306 if (index < ids->size) {
2307 obj = ids->array[index];
2308 if (obj) {
2309 // Return a borrowed reference
2310 return obj;
2311 }
2312 }
2313
2314 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2315 NULL, NULL);
2316 if (!obj) {
2317 return NULL;
2318 }
2319 PyUnicode_InternInPlace(&obj);
2320
2321 if (index >= ids->size) {
2322 // Overallocate to reduce the number of realloc
2323 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2324 Py_ssize_t item_size = sizeof(ids->array[0]);
2325 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2326 if (new_array == NULL) {
2327 PyErr_NoMemory();
2328 return NULL;
2329 }
2330 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2331 ids->array = new_array;
2332 ids->size = new_size;
2333 }
2334
2335 // The array stores a strong reference
2336 ids->array[index] = obj;
2337
2338 // Return a borrowed reference
2339 return obj;
2340 }
2341
2342
2343 static void
unicode_clear_identifiers(struct _Py_unicode_state * state)2344 unicode_clear_identifiers(struct _Py_unicode_state *state)
2345 {
2346 struct _Py_unicode_ids *ids = &state->ids;
2347 for (Py_ssize_t i=0; i < ids->size; i++) {
2348 Py_XDECREF(ids->array[i]);
2349 }
2350 ids->size = 0;
2351 PyMem_Free(ids->array);
2352 ids->array = NULL;
2353 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2354 // after Py_Finalize().
2355 }
2356
2357
2358 /* Internal function, doesn't check maximum character */
2359
2360 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2361 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2362 {
2363 const unsigned char *s = (const unsigned char *)buffer;
2364 PyObject *unicode;
2365 if (size == 1) {
2366 #ifdef Py_DEBUG
2367 assert((unsigned char)s[0] < 128);
2368 #endif
2369 return get_latin1_char(s[0]);
2370 }
2371 unicode = PyUnicode_New(size, 127);
2372 if (!unicode)
2373 return NULL;
2374 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2375 assert(_PyUnicode_CheckConsistency(unicode, 1));
2376 return unicode;
2377 }
2378
2379 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2380 kind_maxchar_limit(unsigned int kind)
2381 {
2382 switch (kind) {
2383 case PyUnicode_1BYTE_KIND:
2384 return 0x80;
2385 case PyUnicode_2BYTE_KIND:
2386 return 0x100;
2387 case PyUnicode_4BYTE_KIND:
2388 return 0x10000;
2389 default:
2390 Py_UNREACHABLE();
2391 }
2392 }
2393
2394 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2395 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2396 {
2397 PyObject *res;
2398 unsigned char max_char;
2399
2400 if (size == 0) {
2401 _Py_RETURN_UNICODE_EMPTY();
2402 }
2403 assert(size > 0);
2404 if (size == 1) {
2405 return get_latin1_char(u[0]);
2406 }
2407
2408 max_char = ucs1lib_find_max_char(u, u + size);
2409 res = PyUnicode_New(size, max_char);
2410 if (!res)
2411 return NULL;
2412 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2413 assert(_PyUnicode_CheckConsistency(res, 1));
2414 return res;
2415 }
2416
2417 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2418 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2419 {
2420 PyObject *res;
2421 Py_UCS2 max_char;
2422
2423 if (size == 0)
2424 _Py_RETURN_UNICODE_EMPTY();
2425 assert(size > 0);
2426 if (size == 1)
2427 return unicode_char(u[0]);
2428
2429 max_char = ucs2lib_find_max_char(u, u + size);
2430 res = PyUnicode_New(size, max_char);
2431 if (!res)
2432 return NULL;
2433 if (max_char >= 256)
2434 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2435 else {
2436 _PyUnicode_CONVERT_BYTES(
2437 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2438 }
2439 assert(_PyUnicode_CheckConsistency(res, 1));
2440 return res;
2441 }
2442
2443 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2444 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2445 {
2446 PyObject *res;
2447 Py_UCS4 max_char;
2448
2449 if (size == 0)
2450 _Py_RETURN_UNICODE_EMPTY();
2451 assert(size > 0);
2452 if (size == 1)
2453 return unicode_char(u[0]);
2454
2455 max_char = ucs4lib_find_max_char(u, u + size);
2456 res = PyUnicode_New(size, max_char);
2457 if (!res)
2458 return NULL;
2459 if (max_char < 256)
2460 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2461 PyUnicode_1BYTE_DATA(res));
2462 else if (max_char < 0x10000)
2463 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2464 PyUnicode_2BYTE_DATA(res));
2465 else
2466 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2467 assert(_PyUnicode_CheckConsistency(res, 1));
2468 return res;
2469 }
2470
2471 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2472 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2473 {
2474 if (size < 0) {
2475 PyErr_SetString(PyExc_ValueError, "size must be positive");
2476 return NULL;
2477 }
2478 switch (kind) {
2479 case PyUnicode_1BYTE_KIND:
2480 return _PyUnicode_FromUCS1(buffer, size);
2481 case PyUnicode_2BYTE_KIND:
2482 return _PyUnicode_FromUCS2(buffer, size);
2483 case PyUnicode_4BYTE_KIND:
2484 return _PyUnicode_FromUCS4(buffer, size);
2485 default:
2486 PyErr_SetString(PyExc_SystemError, "invalid kind");
2487 return NULL;
2488 }
2489 }
2490
2491 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2492 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2493 {
2494 enum PyUnicode_Kind kind;
2495 const void *startptr, *endptr;
2496
2497 assert(PyUnicode_IS_READY(unicode));
2498 assert(0 <= start);
2499 assert(end <= PyUnicode_GET_LENGTH(unicode));
2500 assert(start <= end);
2501
2502 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2503 return PyUnicode_MAX_CHAR_VALUE(unicode);
2504
2505 if (start == end)
2506 return 127;
2507
2508 if (PyUnicode_IS_ASCII(unicode))
2509 return 127;
2510
2511 kind = PyUnicode_KIND(unicode);
2512 startptr = PyUnicode_DATA(unicode);
2513 endptr = (char *)startptr + end * kind;
2514 startptr = (char *)startptr + start * kind;
2515 switch(kind) {
2516 case PyUnicode_1BYTE_KIND:
2517 return ucs1lib_find_max_char(startptr, endptr);
2518 case PyUnicode_2BYTE_KIND:
2519 return ucs2lib_find_max_char(startptr, endptr);
2520 case PyUnicode_4BYTE_KIND:
2521 return ucs4lib_find_max_char(startptr, endptr);
2522 default:
2523 Py_UNREACHABLE();
2524 }
2525 }
2526
2527 /* Ensure that a string uses the most efficient storage, if it is not the
2528 case: create a new string with of the right kind. Write NULL into *p_unicode
2529 on error. */
2530 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2531 unicode_adjust_maxchar(PyObject **p_unicode)
2532 {
2533 PyObject *unicode, *copy;
2534 Py_UCS4 max_char;
2535 Py_ssize_t len;
2536 unsigned int kind;
2537
2538 assert(p_unicode != NULL);
2539 unicode = *p_unicode;
2540 assert(PyUnicode_IS_READY(unicode));
2541 if (PyUnicode_IS_ASCII(unicode))
2542 return;
2543
2544 len = PyUnicode_GET_LENGTH(unicode);
2545 kind = PyUnicode_KIND(unicode);
2546 if (kind == PyUnicode_1BYTE_KIND) {
2547 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2548 max_char = ucs1lib_find_max_char(u, u + len);
2549 if (max_char >= 128)
2550 return;
2551 }
2552 else if (kind == PyUnicode_2BYTE_KIND) {
2553 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2554 max_char = ucs2lib_find_max_char(u, u + len);
2555 if (max_char >= 256)
2556 return;
2557 }
2558 else if (kind == PyUnicode_4BYTE_KIND) {
2559 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2560 max_char = ucs4lib_find_max_char(u, u + len);
2561 if (max_char >= 0x10000)
2562 return;
2563 }
2564 else
2565 Py_UNREACHABLE();
2566
2567 copy = PyUnicode_New(len, max_char);
2568 if (copy != NULL)
2569 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2570 Py_DECREF(unicode);
2571 *p_unicode = copy;
2572 }
2573
2574 PyObject*
_PyUnicode_Copy(PyObject * unicode)2575 _PyUnicode_Copy(PyObject *unicode)
2576 {
2577 Py_ssize_t length;
2578 PyObject *copy;
2579
2580 if (!PyUnicode_Check(unicode)) {
2581 PyErr_BadInternalCall();
2582 return NULL;
2583 }
2584 if (PyUnicode_READY(unicode) == -1)
2585 return NULL;
2586
2587 length = PyUnicode_GET_LENGTH(unicode);
2588 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2589 if (!copy)
2590 return NULL;
2591 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2592
2593 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2594 length * PyUnicode_KIND(unicode));
2595 assert(_PyUnicode_CheckConsistency(copy, 1));
2596 return copy;
2597 }
2598
2599
2600 /* Widen Unicode objects to larger buffers. Don't write terminating null
2601 character. Return NULL on error. */
2602
2603 static void*
unicode_askind(unsigned int skind,void const * data,Py_ssize_t len,unsigned int kind)2604 unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
2605 {
2606 void *result;
2607
2608 assert(skind < kind);
2609 switch (kind) {
2610 case PyUnicode_2BYTE_KIND:
2611 result = PyMem_New(Py_UCS2, len);
2612 if (!result)
2613 return PyErr_NoMemory();
2614 assert(skind == PyUnicode_1BYTE_KIND);
2615 _PyUnicode_CONVERT_BYTES(
2616 Py_UCS1, Py_UCS2,
2617 (const Py_UCS1 *)data,
2618 ((const Py_UCS1 *)data) + len,
2619 result);
2620 return result;
2621 case PyUnicode_4BYTE_KIND:
2622 result = PyMem_New(Py_UCS4, len);
2623 if (!result)
2624 return PyErr_NoMemory();
2625 if (skind == PyUnicode_2BYTE_KIND) {
2626 _PyUnicode_CONVERT_BYTES(
2627 Py_UCS2, Py_UCS4,
2628 (const Py_UCS2 *)data,
2629 ((const Py_UCS2 *)data) + len,
2630 result);
2631 }
2632 else {
2633 assert(skind == PyUnicode_1BYTE_KIND);
2634 _PyUnicode_CONVERT_BYTES(
2635 Py_UCS1, Py_UCS4,
2636 (const Py_UCS1 *)data,
2637 ((const Py_UCS1 *)data) + len,
2638 result);
2639 }
2640 return result;
2641 default:
2642 Py_UNREACHABLE();
2643 return NULL;
2644 }
2645 }
2646
2647 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2648 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2649 int copy_null)
2650 {
2651 int kind;
2652 const void *data;
2653 Py_ssize_t len, targetlen;
2654 if (PyUnicode_READY(string) == -1)
2655 return NULL;
2656 kind = PyUnicode_KIND(string);
2657 data = PyUnicode_DATA(string);
2658 len = PyUnicode_GET_LENGTH(string);
2659 targetlen = len;
2660 if (copy_null)
2661 targetlen++;
2662 if (!target) {
2663 target = PyMem_New(Py_UCS4, targetlen);
2664 if (!target) {
2665 PyErr_NoMemory();
2666 return NULL;
2667 }
2668 }
2669 else {
2670 if (targetsize < targetlen) {
2671 PyErr_Format(PyExc_SystemError,
2672 "string is longer than the buffer");
2673 if (copy_null && 0 < targetsize)
2674 target[0] = 0;
2675 return NULL;
2676 }
2677 }
2678 if (kind == PyUnicode_1BYTE_KIND) {
2679 const Py_UCS1 *start = (const Py_UCS1 *) data;
2680 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2681 }
2682 else if (kind == PyUnicode_2BYTE_KIND) {
2683 const Py_UCS2 *start = (const Py_UCS2 *) data;
2684 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2685 }
2686 else if (kind == PyUnicode_4BYTE_KIND) {
2687 memcpy(target, data, len * sizeof(Py_UCS4));
2688 }
2689 else {
2690 Py_UNREACHABLE();
2691 }
2692 if (copy_null)
2693 target[len] = 0;
2694 return target;
2695 }
2696
2697 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2698 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2699 int copy_null)
2700 {
2701 if (target == NULL || targetsize < 0) {
2702 PyErr_BadInternalCall();
2703 return NULL;
2704 }
2705 return as_ucs4(string, target, targetsize, copy_null);
2706 }
2707
2708 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2709 PyUnicode_AsUCS4Copy(PyObject *string)
2710 {
2711 return as_ucs4(string, NULL, 0, 1);
2712 }
2713
/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).

   In the formula below, (SIZEOF_LONG_LONG*53-1) / 22 + 1 is the integer
   form of ceil(SIZEOF_LONG_LONG*53/22); the remaining + 1 is the sign.
   NOTE(review): no byte is explicitly reserved for the terminating null
   character written by sprintf() into buffers of this size; this looks
   like it relies on the slack in the bound -- confirm. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2718
2719 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2720 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2721 Py_ssize_t width, Py_ssize_t precision)
2722 {
2723 Py_ssize_t length, fill, arglen;
2724 Py_UCS4 maxchar;
2725
2726 if (PyUnicode_READY(str) == -1)
2727 return -1;
2728
2729 length = PyUnicode_GET_LENGTH(str);
2730 if ((precision == -1 || precision >= length)
2731 && width <= length)
2732 return _PyUnicodeWriter_WriteStr(writer, str);
2733
2734 if (precision != -1)
2735 length = Py_MIN(precision, length);
2736
2737 arglen = Py_MAX(length, width);
2738 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2739 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2740 else
2741 maxchar = writer->maxchar;
2742
2743 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2744 return -1;
2745
2746 if (width > length) {
2747 fill = width - length;
2748 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2749 return -1;
2750 writer->pos += fill;
2751 }
2752
2753 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2754 str, 0, length);
2755 writer->pos += length;
2756 return 0;
2757 }
2758
2759 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2760 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2761 Py_ssize_t width, Py_ssize_t precision)
2762 {
2763 /* UTF-8 */
2764 Py_ssize_t length;
2765 PyObject *unicode;
2766 int res;
2767
2768 if (precision == -1) {
2769 length = strlen(str);
2770 }
2771 else {
2772 length = 0;
2773 while (length < precision && str[length]) {
2774 length++;
2775 }
2776 }
2777 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2778 if (unicode == NULL)
2779 return -1;
2780
2781 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2782 Py_DECREF(unicode);
2783 return res;
2784 }
2785
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   `f` points to the '%' that starts the directive.  The optional '0'
   flag, width, precision and length modifier ('l', 'll' or 'z') are
   parsed, the matching argument(s) are fetched from `*vargs`, and the
   formatted result is appended to `writer`.

   Return a pointer to the first character after the directive, or NULL
   on error.  For an unknown conversion character the rest of the format
   string (starting at the '%') is copied verbatim and a pointer to its
   terminating null character is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the directive starts (the '%'): the fallback for
       unknown conversions copies the format string from there. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* -1 means "not specified", for the width and for the precision. */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Check before accumulating the next digit, so that
               width * 10 + digit cannot overflow. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* The 'l' and 'll' modifiers are only recognized in front of
       'd', 'u' or 'i'. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format
       string: nothing follows, so stop overallocating. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single character, passed as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the bare number with sprintf(), picking the argument
           type from the length modifier; width and precision are
           applied by hand below. */
        if (*f == 'u') {
            if (longflag) {
                len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
            }
            else {
                len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
            }
        }
        else if (*f == 'x') {
            /* No length modifier is supported for 'x'. */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag) {
                len = sprintf(buffer, "%li", va_arg(*vargs, long));
            }
            else if (longlongflag) {
                len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
            }
            else if (size_tflag) {
                len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
            }
            else {
                len = sprintf(buffer, "%i", va_arg(*vargs, int));
            }
        }
        assert(len >= 0);

        /* From here on, precision is the number of characters written
           for the number itself, including any '0' fill. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the width, with '0' or ' ' depending on the flag. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with zeros. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the digits (and the terminating
               null character) right by two and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A str object. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Consumes two arguments: a str object, and a C string that is
           used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* The result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* The result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* The result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* A literal '%'; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        /* Point at the terminating null character. */
        f = p+len;
        return f;
    }

    /* Skip the conversion character. */
    f++;
    return f;
}
3083
3084 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)3085 PyUnicode_FromFormatV(const char *format, va_list vargs)
3086 {
3087 va_list vargs2;
3088 const char *f;
3089 _PyUnicodeWriter writer;
3090
3091 _PyUnicodeWriter_Init(&writer);
3092 writer.min_length = strlen(format) + 100;
3093 writer.overallocate = 1;
3094
3095 // Copy varags to be able to pass a reference to a subfunction.
3096 va_copy(vargs2, vargs);
3097
3098 for (f = format; *f; ) {
3099 if (*f == '%') {
3100 f = unicode_fromformat_arg(&writer, f, &vargs2);
3101 if (f == NULL)
3102 goto fail;
3103 }
3104 else {
3105 const char *p;
3106 Py_ssize_t len;
3107
3108 p = f;
3109 do
3110 {
3111 if ((unsigned char)*p > 127) {
3112 PyErr_Format(PyExc_ValueError,
3113 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3114 "string, got a non-ASCII byte: 0x%02x",
3115 (unsigned char)*p);
3116 goto fail;
3117 }
3118 p++;
3119 }
3120 while (*p != '\0' && *p != '%');
3121 len = p - f;
3122
3123 if (*p == '\0')
3124 writer.overallocate = 0;
3125
3126 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
3127 goto fail;
3128
3129 f = p;
3130 }
3131 }
3132 va_end(vargs2);
3133 return _PyUnicodeWriter_Finish(&writer);
3134
3135 fail:
3136 va_end(vargs2);
3137 _PyUnicodeWriter_Dealloc(&writer);
3138 return NULL;
3139 }
3140
3141 PyObject *
PyUnicode_FromFormat(const char * format,...)3142 PyUnicode_FromFormat(const char *format, ...)
3143 {
3144 PyObject* ret;
3145 va_list vargs;
3146
3147 #ifdef HAVE_STDARG_PROTOTYPES
3148 va_start(vargs, format);
3149 #else
3150 va_start(vargs);
3151 #endif
3152 ret = PyUnicode_FromFormatV(format, vargs);
3153 va_end(vargs);
3154 return ret;
3155 }
3156
3157 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3158 unicode_get_widechar_size(PyObject *unicode)
3159 {
3160 Py_ssize_t res;
3161
3162 assert(unicode != NULL);
3163 assert(_PyUnicode_CHECK(unicode));
3164
3165 #if USE_UNICODE_WCHAR_CACHE
3166 if (_PyUnicode_WSTR(unicode) != NULL) {
3167 return PyUnicode_WSTR_LENGTH(unicode);
3168 }
3169 #endif /* USE_UNICODE_WCHAR_CACHE */
3170 assert(PyUnicode_IS_READY(unicode));
3171
3172 res = _PyUnicode_LENGTH(unicode);
3173 #if SIZEOF_WCHAR_T == 2
3174 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3175 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3176 const Py_UCS4 *end = s + res;
3177 for (; s < end; ++s) {
3178 if (*s > 0xFFFF) {
3179 ++res;
3180 }
3181 }
3182 }
3183 #endif
3184 return res;
3185 }
3186
3187 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3188 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3189 {
3190 assert(unicode != NULL);
3191 assert(_PyUnicode_CHECK(unicode));
3192
3193 #if USE_UNICODE_WCHAR_CACHE
3194 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
3195 if (wstr != NULL) {
3196 memcpy(w, wstr, size * sizeof(wchar_t));
3197 return;
3198 }
3199 #else /* USE_UNICODE_WCHAR_CACHE */
3200 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3201 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3202 return;
3203 }
3204 #endif /* USE_UNICODE_WCHAR_CACHE */
3205 assert(PyUnicode_IS_READY(unicode));
3206
3207 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3208 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3209 for (; size--; ++s, ++w) {
3210 *w = *s;
3211 }
3212 }
3213 else {
3214 #if SIZEOF_WCHAR_T == 4
3215 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3216 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3217 for (; size--; ++s, ++w) {
3218 *w = *s;
3219 }
3220 #else
3221 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3222 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3223 for (; size--; ++s, ++w) {
3224 Py_UCS4 ch = *s;
3225 if (ch > 0xFFFF) {
3226 assert(ch <= MAX_UNICODE);
3227 /* encode surrogate pair in this case */
3228 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3229 if (!size--)
3230 break;
3231 *w = Py_UNICODE_LOW_SURROGATE(ch);
3232 }
3233 else {
3234 *w = ch;
3235 }
3236 }
3237 #endif
3238 }
3239 }
3240
3241 #ifdef HAVE_WCHAR_H
3242
3243 /* Convert a Unicode object to a wide character string.
3244
3245 - If w is NULL: return the number of wide characters (including the null
3246 character) required to convert the unicode object. Ignore size argument.
3247
3248 - Otherwise: return the number of wide characters (excluding the null
3249 character) written into w. Write at most size wide characters (including
3250 the null character). */
3251 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3252 PyUnicode_AsWideChar(PyObject *unicode,
3253 wchar_t *w,
3254 Py_ssize_t size)
3255 {
3256 Py_ssize_t res;
3257
3258 if (unicode == NULL) {
3259 PyErr_BadInternalCall();
3260 return -1;
3261 }
3262 if (!PyUnicode_Check(unicode)) {
3263 PyErr_BadArgument();
3264 return -1;
3265 }
3266
3267 res = unicode_get_widechar_size(unicode);
3268 if (w == NULL) {
3269 return res + 1;
3270 }
3271
3272 if (size > res) {
3273 size = res + 1;
3274 }
3275 else {
3276 res = size;
3277 }
3278 unicode_copy_as_widechar(unicode, w, size);
3279
3280 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3281 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3282 non-Unicode locales and hence needs conversion first. */
3283 if (_Py_LocaleUsesNonUnicodeWchar()) {
3284 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3285 return -1;
3286 }
3287 }
3288 #endif
3289
3290 return res;
3291 }
3292
3293 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3294 PyUnicode_AsWideCharString(PyObject *unicode,
3295 Py_ssize_t *size)
3296 {
3297 wchar_t *buffer;
3298 Py_ssize_t buflen;
3299
3300 if (unicode == NULL) {
3301 PyErr_BadInternalCall();
3302 return NULL;
3303 }
3304 if (!PyUnicode_Check(unicode)) {
3305 PyErr_BadArgument();
3306 return NULL;
3307 }
3308
3309 buflen = unicode_get_widechar_size(unicode);
3310 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3311 if (buffer == NULL) {
3312 PyErr_NoMemory();
3313 return NULL;
3314 }
3315 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3316
3317 #ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319 non-Unicode locales and hence needs conversion first. */
3320 if (_Py_LocaleUsesNonUnicodeWchar()) {
3321 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3322 return NULL;
3323 }
3324 }
3325 #endif
3326
3327 if (size != NULL) {
3328 *size = buflen;
3329 }
3330 else if (wcslen(buffer) != (size_t)buflen) {
3331 PyMem_Free(buffer);
3332 PyErr_SetString(PyExc_ValueError,
3333 "embedded null character");
3334 return NULL;
3335 }
3336 return buffer;
3337 }
3338
3339 #endif /* HAVE_WCHAR_H */
3340
3341 int
_PyUnicode_WideCharString_Converter(PyObject * obj,void * ptr)3342 _PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3343 {
3344 wchar_t **p = (wchar_t **)ptr;
3345 if (obj == NULL) {
3346 #if !USE_UNICODE_WCHAR_CACHE
3347 PyMem_Free(*p);
3348 #endif /* USE_UNICODE_WCHAR_CACHE */
3349 *p = NULL;
3350 return 1;
3351 }
3352 if (PyUnicode_Check(obj)) {
3353 #if USE_UNICODE_WCHAR_CACHE
3354 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3355 if (*p == NULL) {
3356 return 0;
3357 }
3358 return 1;
3359 #else /* USE_UNICODE_WCHAR_CACHE */
3360 *p = PyUnicode_AsWideCharString(obj, NULL);
3361 if (*p == NULL) {
3362 return 0;
3363 }
3364 return Py_CLEANUP_SUPPORTED;
3365 #endif /* USE_UNICODE_WCHAR_CACHE */
3366 }
3367 PyErr_Format(PyExc_TypeError,
3368 "argument must be str, not %.50s",
3369 Py_TYPE(obj)->tp_name);
3370 return 0;
3371 }
3372
3373 int
_PyUnicode_WideCharString_Opt_Converter(PyObject * obj,void * ptr)3374 _PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3375 {
3376 wchar_t **p = (wchar_t **)ptr;
3377 if (obj == NULL) {
3378 #if !USE_UNICODE_WCHAR_CACHE
3379 PyMem_Free(*p);
3380 #endif /* USE_UNICODE_WCHAR_CACHE */
3381 *p = NULL;
3382 return 1;
3383 }
3384 if (obj == Py_None) {
3385 *p = NULL;
3386 return 1;
3387 }
3388 if (PyUnicode_Check(obj)) {
3389 #if USE_UNICODE_WCHAR_CACHE
3390 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3391 if (*p == NULL) {
3392 return 0;
3393 }
3394 return 1;
3395 #else /* USE_UNICODE_WCHAR_CACHE */
3396 *p = PyUnicode_AsWideCharString(obj, NULL);
3397 if (*p == NULL) {
3398 return 0;
3399 }
3400 return Py_CLEANUP_SUPPORTED;
3401 #endif /* USE_UNICODE_WCHAR_CACHE */
3402 }
3403 PyErr_Format(PyExc_TypeError,
3404 "argument must be str or None, not %.50s",
3405 Py_TYPE(obj)->tp_name);
3406 return 0;
3407 }
3408
3409 PyObject *
PyUnicode_FromOrdinal(int ordinal)3410 PyUnicode_FromOrdinal(int ordinal)
3411 {
3412 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3413 PyErr_SetString(PyExc_ValueError,
3414 "chr() arg not in range(0x110000)");
3415 return NULL;
3416 }
3417
3418 return unicode_char((Py_UCS4)ordinal);
3419 }
3420
3421 PyObject *
PyUnicode_FromObject(PyObject * obj)3422 PyUnicode_FromObject(PyObject *obj)
3423 {
3424 /* XXX Perhaps we should make this API an alias of
3425 PyObject_Str() instead ?! */
3426 if (PyUnicode_CheckExact(obj)) {
3427 if (PyUnicode_READY(obj) == -1)
3428 return NULL;
3429 Py_INCREF(obj);
3430 return obj;
3431 }
3432 if (PyUnicode_Check(obj)) {
3433 /* For a Unicode subtype that's not a Unicode object,
3434 return a true Unicode object with the same data. */
3435 return _PyUnicode_Copy(obj);
3436 }
3437 PyErr_Format(PyExc_TypeError,
3438 "Can't convert '%.100s' object to str implicitly",
3439 Py_TYPE(obj)->tp_name);
3440 return NULL;
3441 }
3442
3443 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3444 PyUnicode_FromEncodedObject(PyObject *obj,
3445 const char *encoding,
3446 const char *errors)
3447 {
3448 Py_buffer buffer;
3449 PyObject *v;
3450
3451 if (obj == NULL) {
3452 PyErr_BadInternalCall();
3453 return NULL;
3454 }
3455
3456 /* Decoding bytes objects is the most common case and should be fast */
3457 if (PyBytes_Check(obj)) {
3458 if (PyBytes_GET_SIZE(obj) == 0) {
3459 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3460 return NULL;
3461 }
3462 _Py_RETURN_UNICODE_EMPTY();
3463 }
3464 return PyUnicode_Decode(
3465 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3466 encoding, errors);
3467 }
3468
3469 if (PyUnicode_Check(obj)) {
3470 PyErr_SetString(PyExc_TypeError,
3471 "decoding str is not supported");
3472 return NULL;
3473 }
3474
3475 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3476 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3477 PyErr_Format(PyExc_TypeError,
3478 "decoding to str: need a bytes-like object, %.80s found",
3479 Py_TYPE(obj)->tp_name);
3480 return NULL;
3481 }
3482
3483 if (buffer.len == 0) {
3484 PyBuffer_Release(&buffer);
3485 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3486 return NULL;
3487 }
3488 _Py_RETURN_UNICODE_EMPTY();
3489 }
3490
3491 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3492 PyBuffer_Release(&buffer);
3493 return v;
3494 }
3495
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes):
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters becomes a single '_' when it sits between two copied
   characters, and is dropped when it is at the start or the end.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const out_last = &lower[lower_len - 1];
    /* Set while a separator run is waiting to be flushed as '_'. */
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3544
3545 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3546 PyUnicode_Decode(const char *s,
3547 Py_ssize_t size,
3548 const char *encoding,
3549 const char *errors)
3550 {
3551 PyObject *buffer = NULL, *unicode;
3552 Py_buffer info;
3553 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3554
3555 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3556 return NULL;
3557 }
3558
3559 if (size == 0) {
3560 _Py_RETURN_UNICODE_EMPTY();
3561 }
3562
3563 if (encoding == NULL) {
3564 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3565 }
3566
3567 /* Shortcuts for common default encodings */
3568 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3569 char *lower = buflower;
3570
3571 /* Fast paths */
3572 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3573 lower += 3;
3574 if (*lower == '_') {
3575 /* Match "utf8" and "utf_8" */
3576 lower++;
3577 }
3578
3579 if (lower[0] == '8' && lower[1] == 0) {
3580 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3581 }
3582 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3583 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3584 }
3585 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3586 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3587 }
3588 }
3589 else {
3590 if (strcmp(lower, "ascii") == 0
3591 || strcmp(lower, "us_ascii") == 0) {
3592 return PyUnicode_DecodeASCII(s, size, errors);
3593 }
3594 #ifdef MS_WINDOWS
3595 else if (strcmp(lower, "mbcs") == 0) {
3596 return PyUnicode_DecodeMBCS(s, size, errors);
3597 }
3598 #endif
3599 else if (strcmp(lower, "latin1") == 0
3600 || strcmp(lower, "latin_1") == 0
3601 || strcmp(lower, "iso_8859_1") == 0
3602 || strcmp(lower, "iso8859_1") == 0) {
3603 return PyUnicode_DecodeLatin1(s, size, errors);
3604 }
3605 }
3606 }
3607
3608 /* Decode via the codec registry */
3609 buffer = NULL;
3610 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3611 goto onError;
3612 buffer = PyMemoryView_FromBuffer(&info);
3613 if (buffer == NULL)
3614 goto onError;
3615 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3616 if (unicode == NULL)
3617 goto onError;
3618 if (!PyUnicode_Check(unicode)) {
3619 PyErr_Format(PyExc_TypeError,
3620 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3621 "use codecs.decode() to decode to arbitrary types",
3622 encoding,
3623 Py_TYPE(unicode)->tp_name);
3624 Py_DECREF(unicode);
3625 goto onError;
3626 }
3627 Py_DECREF(buffer);
3628 return unicode_result(unicode);
3629
3630 onError:
3631 Py_XDECREF(buffer);
3632 return NULL;
3633 }
3634
3635 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3636 PyUnicode_AsDecodedObject(PyObject *unicode,
3637 const char *encoding,
3638 const char *errors)
3639 {
3640 if (!PyUnicode_Check(unicode)) {
3641 PyErr_BadArgument();
3642 return NULL;
3643 }
3644
3645 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3646 "PyUnicode_AsDecodedObject() is deprecated; "
3647 "use PyCodec_Decode() to decode from str", 1) < 0)
3648 return NULL;
3649
3650 if (encoding == NULL)
3651 encoding = PyUnicode_GetDefaultEncoding();
3652
3653 /* Decode via the codec registry */
3654 return PyCodec_Decode(unicode, encoding, errors);
3655 }
3656
3657 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3658 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3659 const char *encoding,
3660 const char *errors)
3661 {
3662 PyObject *v;
3663
3664 if (!PyUnicode_Check(unicode)) {
3665 PyErr_BadArgument();
3666 goto onError;
3667 }
3668
3669 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3670 "PyUnicode_AsDecodedUnicode() is deprecated; "
3671 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3672 return NULL;
3673
3674 if (encoding == NULL)
3675 encoding = PyUnicode_GetDefaultEncoding();
3676
3677 /* Decode via the codec registry */
3678 v = PyCodec_Decode(unicode, encoding, errors);
3679 if (v == NULL)
3680 goto onError;
3681 if (!PyUnicode_Check(v)) {
3682 PyErr_Format(PyExc_TypeError,
3683 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3684 "use codecs.decode() to decode to arbitrary types",
3685 encoding,
3686 Py_TYPE(unicode)->tp_name);
3687 Py_DECREF(v);
3688 goto onError;
3689 }
3690 return unicode_result(v);
3691
3692 onError:
3693 return NULL;
3694 }
3695
3696 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3697 PyUnicode_AsEncodedObject(PyObject *unicode,
3698 const char *encoding,
3699 const char *errors)
3700 {
3701 PyObject *v;
3702
3703 if (!PyUnicode_Check(unicode)) {
3704 PyErr_BadArgument();
3705 goto onError;
3706 }
3707
3708 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3709 "PyUnicode_AsEncodedObject() is deprecated; "
3710 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3711 "or PyCodec_Encode() for generic encoding", 1) < 0)
3712 return NULL;
3713
3714 if (encoding == NULL)
3715 encoding = PyUnicode_GetDefaultEncoding();
3716
3717 /* Encode via the codec registry */
3718 v = PyCodec_Encode(unicode, encoding, errors);
3719 if (v == NULL)
3720 goto onError;
3721 return v;
3722
3723 onError:
3724 return NULL;
3725 }
3726
3727
3728 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3729 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3730 int current_locale)
3731 {
3732 Py_ssize_t wlen;
3733 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3734 if (wstr == NULL) {
3735 return NULL;
3736 }
3737
3738 if ((size_t)wlen != wcslen(wstr)) {
3739 PyErr_SetString(PyExc_ValueError, "embedded null character");
3740 PyMem_Free(wstr);
3741 return NULL;
3742 }
3743
3744 char *str;
3745 size_t error_pos;
3746 const char *reason;
3747 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3748 current_locale, error_handler);
3749 PyMem_Free(wstr);
3750
3751 if (res != 0) {
3752 if (res == -2) {
3753 PyObject *exc;
3754 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3755 "locale", unicode,
3756 (Py_ssize_t)error_pos,
3757 (Py_ssize_t)(error_pos+1),
3758 reason);
3759 if (exc != NULL) {
3760 PyCodec_StrictErrors(exc);
3761 Py_DECREF(exc);
3762 }
3763 }
3764 else if (res == -3) {
3765 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3766 }
3767 else {
3768 PyErr_NoMemory();
3769 }
3770 return NULL;
3771 }
3772
3773 PyObject *bytes = PyBytes_FromString(str);
3774 PyMem_RawFree(str);
3775 return bytes;
3776 }
3777
3778 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3779 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3780 {
3781 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3782 return unicode_encode_locale(unicode, error_handler, 1);
3783 }
3784
3785 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3786 PyUnicode_EncodeFSDefault(PyObject *unicode)
3787 {
3788 PyInterpreterState *interp = _PyInterpreterState_GET();
3789 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3790 if (fs_codec->utf8) {
3791 return unicode_encode_utf8(unicode,
3792 fs_codec->error_handler,
3793 fs_codec->errors);
3794 }
3795 #ifndef _Py_FORCE_UTF8_FS_ENCODING
3796 else if (fs_codec->encoding) {
3797 return PyUnicode_AsEncodedString(unicode,
3798 fs_codec->encoding,
3799 fs_codec->errors);
3800 }
3801 #endif
3802 else {
3803 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3804 machinery is not ready and so cannot be used:
3805 use wcstombs() in this case. */
3806 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3807 const wchar_t *filesystem_errors = config->filesystem_errors;
3808 assert(filesystem_errors != NULL);
3809 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3810 assert(errors != _Py_ERROR_UNKNOWN);
3811 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3812 return unicode_encode_utf8(unicode, errors, NULL);
3813 #else
3814 return unicode_encode_locale(unicode, errors, 0);
3815 #endif
3816 }
3817 }
3818
3819 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3820 PyUnicode_AsEncodedString(PyObject *unicode,
3821 const char *encoding,
3822 const char *errors)
3823 {
3824 PyObject *v;
3825 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3826
3827 if (!PyUnicode_Check(unicode)) {
3828 PyErr_BadArgument();
3829 return NULL;
3830 }
3831
3832 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3833 return NULL;
3834 }
3835
3836 if (encoding == NULL) {
3837 return _PyUnicode_AsUTF8String(unicode, errors);
3838 }
3839
3840 /* Shortcuts for common default encodings */
3841 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3842 char *lower = buflower;
3843
3844 /* Fast paths */
3845 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3846 lower += 3;
3847 if (*lower == '_') {
3848 /* Match "utf8" and "utf_8" */
3849 lower++;
3850 }
3851
3852 if (lower[0] == '8' && lower[1] == 0) {
3853 return _PyUnicode_AsUTF8String(unicode, errors);
3854 }
3855 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3856 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3857 }
3858 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3859 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3860 }
3861 }
3862 else {
3863 if (strcmp(lower, "ascii") == 0
3864 || strcmp(lower, "us_ascii") == 0) {
3865 return _PyUnicode_AsASCIIString(unicode, errors);
3866 }
3867 #ifdef MS_WINDOWS
3868 else if (strcmp(lower, "mbcs") == 0) {
3869 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3870 }
3871 #endif
3872 else if (strcmp(lower, "latin1") == 0 ||
3873 strcmp(lower, "latin_1") == 0 ||
3874 strcmp(lower, "iso_8859_1") == 0 ||
3875 strcmp(lower, "iso8859_1") == 0) {
3876 return _PyUnicode_AsLatin1String(unicode, errors);
3877 }
3878 }
3879 }
3880
3881 /* Encode via the codec registry */
3882 v = _PyCodec_EncodeText(unicode, encoding, errors);
3883 if (v == NULL)
3884 return NULL;
3885
3886 /* The normal path */
3887 if (PyBytes_Check(v))
3888 return v;
3889
3890 /* If the codec returns a buffer, raise a warning and convert to bytes */
3891 if (PyByteArray_Check(v)) {
3892 int error;
3893 PyObject *b;
3894
3895 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3896 "encoder %s returned bytearray instead of bytes; "
3897 "use codecs.encode() to encode to arbitrary types",
3898 encoding);
3899 if (error) {
3900 Py_DECREF(v);
3901 return NULL;
3902 }
3903
3904 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3905 PyByteArray_GET_SIZE(v));
3906 Py_DECREF(v);
3907 return b;
3908 }
3909
3910 PyErr_Format(PyExc_TypeError,
3911 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3912 "use codecs.encode() to encode to arbitrary types",
3913 encoding,
3914 Py_TYPE(v)->tp_name);
3915 Py_DECREF(v);
3916 return NULL;
3917 }
3918
3919 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3920 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3921 const char *encoding,
3922 const char *errors)
3923 {
3924 PyObject *v;
3925
3926 if (!PyUnicode_Check(unicode)) {
3927 PyErr_BadArgument();
3928 goto onError;
3929 }
3930
3931 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3932 "PyUnicode_AsEncodedUnicode() is deprecated; "
3933 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3934 return NULL;
3935
3936 if (encoding == NULL)
3937 encoding = PyUnicode_GetDefaultEncoding();
3938
3939 /* Encode via the codec registry */
3940 v = PyCodec_Encode(unicode, encoding, errors);
3941 if (v == NULL)
3942 goto onError;
3943 if (!PyUnicode_Check(v)) {
3944 PyErr_Format(PyExc_TypeError,
3945 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3946 "use codecs.encode() to encode to arbitrary types",
3947 encoding,
3948 Py_TYPE(v)->tp_name);
3949 Py_DECREF(v);
3950 goto onError;
3951 }
3952 return v;
3953
3954 onError:
3955 return NULL;
3956 }
3957
3958 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3959 unicode_decode_locale(const char *str, Py_ssize_t len,
3960 _Py_error_handler errors, int current_locale)
3961 {
3962 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3963 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3964 return NULL;
3965 }
3966
3967 wchar_t *wstr;
3968 size_t wlen;
3969 const char *reason;
3970 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3971 current_locale, errors);
3972 if (res != 0) {
3973 if (res == -2) {
3974 PyObject *exc;
3975 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3976 "locale", str, len,
3977 (Py_ssize_t)wlen,
3978 (Py_ssize_t)(wlen + 1),
3979 reason);
3980 if (exc != NULL) {
3981 PyCodec_StrictErrors(exc);
3982 Py_DECREF(exc);
3983 }
3984 }
3985 else if (res == -3) {
3986 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3987 }
3988 else {
3989 PyErr_NoMemory();
3990 }
3991 return NULL;
3992 }
3993
3994 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3995 PyMem_RawFree(wstr);
3996 return unicode;
3997 }
3998
3999 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)4000 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4001 const char *errors)
4002 {
4003 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4004 return unicode_decode_locale(str, len, error_handler, 1);
4005 }
4006
4007 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)4008 PyUnicode_DecodeLocale(const char *str, const char *errors)
4009 {
4010 Py_ssize_t size = (Py_ssize_t)strlen(str);
4011 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4012 return unicode_decode_locale(str, size, error_handler, 1);
4013 }
4014
4015
4016 PyObject*
PyUnicode_DecodeFSDefault(const char * s)4017 PyUnicode_DecodeFSDefault(const char *s) {
4018 Py_ssize_t size = (Py_ssize_t)strlen(s);
4019 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4020 }
4021
4022 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)4023 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4024 {
4025 PyInterpreterState *interp = _PyInterpreterState_GET();
4026 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4027 if (fs_codec->utf8) {
4028 return unicode_decode_utf8(s, size,
4029 fs_codec->error_handler,
4030 fs_codec->errors,
4031 NULL);
4032 }
4033 #ifndef _Py_FORCE_UTF8_FS_ENCODING
4034 else if (fs_codec->encoding) {
4035 return PyUnicode_Decode(s, size,
4036 fs_codec->encoding,
4037 fs_codec->errors);
4038 }
4039 #endif
4040 else {
4041 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4042 machinery is not ready and so cannot be used:
4043 use mbstowcs() in this case. */
4044 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4045 const wchar_t *filesystem_errors = config->filesystem_errors;
4046 assert(filesystem_errors != NULL);
4047 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4048 assert(errors != _Py_ERROR_UNKNOWN);
4049 #ifdef _Py_FORCE_UTF8_FS_ENCODING
4050 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4051 #else
4052 return unicode_decode_locale(s, size, errors, 0);
4053 #endif
4054 }
4055 }
4056
4057
4058 int
PyUnicode_FSConverter(PyObject * arg,void * addr)4059 PyUnicode_FSConverter(PyObject* arg, void* addr)
4060 {
4061 PyObject *path = NULL;
4062 PyObject *output = NULL;
4063 Py_ssize_t size;
4064 const char *data;
4065 if (arg == NULL) {
4066 Py_DECREF(*(PyObject**)addr);
4067 *(PyObject**)addr = NULL;
4068 return 1;
4069 }
4070 path = PyOS_FSPath(arg);
4071 if (path == NULL) {
4072 return 0;
4073 }
4074 if (PyBytes_Check(path)) {
4075 output = path;
4076 }
4077 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4078 output = PyUnicode_EncodeFSDefault(path);
4079 Py_DECREF(path);
4080 if (!output) {
4081 return 0;
4082 }
4083 assert(PyBytes_Check(output));
4084 }
4085
4086 size = PyBytes_GET_SIZE(output);
4087 data = PyBytes_AS_STRING(output);
4088 if ((size_t)size != strlen(data)) {
4089 PyErr_SetString(PyExc_ValueError, "embedded null byte");
4090 Py_DECREF(output);
4091 return 0;
4092 }
4093 *(PyObject**)addr = output;
4094 return Py_CLEANUP_SUPPORTED;
4095 }
4096
4097
4098 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)4099 PyUnicode_FSDecoder(PyObject* arg, void* addr)
4100 {
4101 int is_buffer = 0;
4102 PyObject *path = NULL;
4103 PyObject *output = NULL;
4104 if (arg == NULL) {
4105 Py_DECREF(*(PyObject**)addr);
4106 *(PyObject**)addr = NULL;
4107 return 1;
4108 }
4109
4110 is_buffer = PyObject_CheckBuffer(arg);
4111 if (!is_buffer) {
4112 path = PyOS_FSPath(arg);
4113 if (path == NULL) {
4114 return 0;
4115 }
4116 }
4117 else {
4118 path = arg;
4119 Py_INCREF(arg);
4120 }
4121
4122 if (PyUnicode_Check(path)) {
4123 output = path;
4124 }
4125 else if (PyBytes_Check(path) || is_buffer) {
4126 PyObject *path_bytes = NULL;
4127
4128 if (!PyBytes_Check(path) &&
4129 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
4130 "path should be string, bytes, or os.PathLike, not %.200s",
4131 Py_TYPE(arg)->tp_name)) {
4132 Py_DECREF(path);
4133 return 0;
4134 }
4135 path_bytes = PyBytes_FromObject(path);
4136 Py_DECREF(path);
4137 if (!path_bytes) {
4138 return 0;
4139 }
4140 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4141 PyBytes_GET_SIZE(path_bytes));
4142 Py_DECREF(path_bytes);
4143 if (!output) {
4144 return 0;
4145 }
4146 }
4147 else {
4148 PyErr_Format(PyExc_TypeError,
4149 "path should be string, bytes, or os.PathLike, not %.200s",
4150 Py_TYPE(arg)->tp_name);
4151 Py_DECREF(path);
4152 return 0;
4153 }
4154 if (PyUnicode_READY(output) == -1) {
4155 Py_DECREF(output);
4156 return 0;
4157 }
4158 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4159 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4160 PyErr_SetString(PyExc_ValueError, "embedded null character");
4161 Py_DECREF(output);
4162 return 0;
4163 }
4164 *(PyObject**)addr = output;
4165 return Py_CLEANUP_SUPPORTED;
4166 }
4167
4168
4169 static int unicode_fill_utf8(PyObject *unicode);
4170
4171 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)4172 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4173 {
4174 if (!PyUnicode_Check(unicode)) {
4175 PyErr_BadArgument();
4176 return NULL;
4177 }
4178 if (PyUnicode_READY(unicode) == -1)
4179 return NULL;
4180
4181 if (PyUnicode_UTF8(unicode) == NULL) {
4182 if (unicode_fill_utf8(unicode) == -1) {
4183 return NULL;
4184 }
4185 }
4186
4187 if (psize)
4188 *psize = PyUnicode_UTF8_LENGTH(unicode);
4189 return PyUnicode_UTF8(unicode);
4190 }
4191
4192 const char *
PyUnicode_AsUTF8(PyObject * unicode)4193 PyUnicode_AsUTF8(PyObject *unicode)
4194 {
4195 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4196 }
4197
4198 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)4199 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4200 {
4201 if (!PyUnicode_Check(unicode)) {
4202 PyErr_BadArgument();
4203 return NULL;
4204 }
4205 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4206 if (w == NULL) {
4207 /* Non-ASCII compact unicode object */
4208 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
4209 assert(PyUnicode_IS_READY(unicode));
4210
4211 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4212 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4213 PyErr_NoMemory();
4214 return NULL;
4215 }
4216 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
4217 if (w == NULL) {
4218 PyErr_NoMemory();
4219 return NULL;
4220 }
4221 unicode_copy_as_widechar(unicode, w, wlen + 1);
4222 _PyUnicode_WSTR(unicode) = w;
4223 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4224 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4225 }
4226 }
4227 if (size != NULL)
4228 *size = PyUnicode_WSTR_LENGTH(unicode);
4229 return w;
4230 }
4231
4232 /* Deprecated APIs */
4233
4234 _Py_COMP_DIAG_PUSH
4235 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
4236
4237 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4238 PyUnicode_AsUnicode(PyObject *unicode)
4239 {
4240 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4241 }
4242
4243 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4244 _PyUnicode_AsUnicode(PyObject *unicode)
4245 {
4246 Py_ssize_t size;
4247 const Py_UNICODE *wstr;
4248
4249 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4250 if (wstr && wcslen(wstr) != (size_t)size) {
4251 PyErr_SetString(PyExc_ValueError, "embedded null character");
4252 return NULL;
4253 }
4254 return wstr;
4255 }
4256
4257
4258 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4259 PyUnicode_GetSize(PyObject *unicode)
4260 {
4261 if (!PyUnicode_Check(unicode)) {
4262 PyErr_BadArgument();
4263 goto onError;
4264 }
4265 if (_PyUnicode_WSTR(unicode) == NULL) {
4266 if (PyUnicode_AsUnicode(unicode) == NULL)
4267 goto onError;
4268 }
4269 return PyUnicode_WSTR_LENGTH(unicode);
4270
4271 onError:
4272 return -1;
4273 }
4274
4275 _Py_COMP_DIAG_POP
4276
4277 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4278 PyUnicode_GetLength(PyObject *unicode)
4279 {
4280 if (!PyUnicode_Check(unicode)) {
4281 PyErr_BadArgument();
4282 return -1;
4283 }
4284 if (PyUnicode_READY(unicode) == -1)
4285 return -1;
4286 return PyUnicode_GET_LENGTH(unicode);
4287 }
4288
4289 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4290 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4291 {
4292 const void *data;
4293 int kind;
4294
4295 if (!PyUnicode_Check(unicode)) {
4296 PyErr_BadArgument();
4297 return (Py_UCS4)-1;
4298 }
4299 if (PyUnicode_READY(unicode) == -1) {
4300 return (Py_UCS4)-1;
4301 }
4302 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4303 PyErr_SetString(PyExc_IndexError, "string index out of range");
4304 return (Py_UCS4)-1;
4305 }
4306 data = PyUnicode_DATA(unicode);
4307 kind = PyUnicode_KIND(unicode);
4308 return PyUnicode_READ(kind, data, index);
4309 }
4310
4311 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4312 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4313 {
4314 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4315 PyErr_BadArgument();
4316 return -1;
4317 }
4318 assert(PyUnicode_IS_READY(unicode));
4319 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4320 PyErr_SetString(PyExc_IndexError, "string index out of range");
4321 return -1;
4322 }
4323 if (unicode_check_modifiable(unicode))
4324 return -1;
4325 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4326 PyErr_SetString(PyExc_ValueError, "character out of range");
4327 return -1;
4328 }
4329 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4330 index, ch);
4331 return 0;
4332 }
4333
/* Return the name of the default encoding: always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4339
4340 /* create or adjust a UnicodeDecodeError */
4341 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4342 make_decode_exception(PyObject **exceptionObject,
4343 const char *encoding,
4344 const char *input, Py_ssize_t length,
4345 Py_ssize_t startpos, Py_ssize_t endpos,
4346 const char *reason)
4347 {
4348 if (*exceptionObject == NULL) {
4349 *exceptionObject = PyUnicodeDecodeError_Create(
4350 encoding, input, length, startpos, endpos, reason);
4351 }
4352 else {
4353 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4354 goto onError;
4355 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4356 goto onError;
4357 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4358 goto onError;
4359 }
4360 return;
4361
4362 onError:
4363 Py_CLEAR(*exceptionObject);
4364 }
4365
4366 #ifdef MS_WINDOWS
4367 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4368 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4369 {
4370 if (newsize > *size) {
4371 wchar_t *newbuf = *buf;
4372 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4373 PyErr_NoMemory();
4374 return -1;
4375 }
4376 *buf = newbuf;
4377 }
4378 *size = newsize;
4379 return 0;
4380 }
4381
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   This variant writes the replacement into a wchar_t buffer.

   errors, *errorHandler: error handler name and a cache for the callable
       looked up from it (filled in on first use).
   encoding, reason: stored in the UnicodeDecodeError passed to the handler.
   *input, *inend: bounds of the bytes being decoded; reloaded from the
       exception's object after the handler ran.
   *startinpos, *endinpos: offsets of the offending bytes; *endinpos is
       replaced by the resume position returned by the handler.
   *exceptionObject: exception created or updated for this call.
   *inptr: set to the address at which decoding resumes.
   *buf, *bufsize, *outpos: output buffer, its size in wchar_t units and the
       current write offset; the buffer is grown when needed.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* The part after "Un;" doubles as the error message for a bad result. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up once; later calls reuse the cached object. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Length of the replacement in wchar_t units. */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    /* With a NULL buffer the returned count includes the trailing null
       character, which is not copied below. */
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when that cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    /* NOTE(review): the return value of this copy is not checked. */
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4500 #endif /* MS_WINDOWS */
4501
/* Error handling callback helper for decoders that write their output
   through a _PyUnicodeWriter.

   Builds (or updates) the UnicodeDecodeError, calls the error handler,
   validates its (str, int) result, writes the replacement string to
   `writer` and updates the caller's decoding state:

   errors, *errorHandler: error handler name and a cache for the callable
       looked up from it (filled in on first use).
   encoding, reason: stored in the exception passed to the handler.
   *input, *inend: bounds of the bytes being decoded; reloaded from the
       exception's object after the handler ran.
   *startinpos, *endinpos: offsets of the offending bytes; *endinpos is
       replaced by the resume position returned by the handler.
   *exceptionObject: exception created or updated for this call.
   *inptr: set to the address at which decoding resumes.

   Return 0 on success, -1 with an exception set on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* The part after "Un;" doubles as the error message for a bad result. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up once; later calls reuse the cached object. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes that followed the error before the handler
       ran; must be computed from the old *input / *inend. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* A replacement longer than one character needs extra room in the
       writer. */
    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input is left to decode than before the handler ran (the handler
       replaced the input or moved the position backwards). */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* Publish the new position only once the replacement is written. */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4601
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is never written to,
 * hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be a non-negative character value; NUL and everything from 128
 * upwards is never encoded directly.  directO / directWS are truth values
 * and are parenthesized so that any expression can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4682
4683 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4684 PyUnicode_DecodeUTF7(const char *s,
4685 Py_ssize_t size,
4686 const char *errors)
4687 {
4688 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4689 }
4690
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:   the UTF-7 bytes to decode.
 * errors:    name of the error handler, passed on to
 *            unicode_decode_call_errorhandler_writer().
 * consumed:  if NULL, the input is treated as complete and an unfinished
 *            shift sequence in an inconsistent state is an error.  If
 *            non-NULL, it receives the number of bytes decoded; an
 *            unfinished shift sequence is left unconsumed.
 *
 * Returns a new str object, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                  /* inside a '+...' base-64 section? */
    Py_ssize_t shiftOutStart;         /* writer position when the section began */
    unsigned int base64bits = 0;      /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;   /* pending bits, most recent lowest */
    Py_UCS4 surrogate = 0;            /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by a goto from the end-of-string handling below. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out only when the
                   terminating byte is one that decodes as itself. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts: used for error reporting
               and for backing off in stateful mode. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte above 127 outside of a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The handler may replace the input and moves s to the position
           at which decoding resumes. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have moved the position back into the input:
               resume decoding inside the loop. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: report everything from its '+'
               onwards as not consumed and drop the output it produced. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from the first shiftOutStart characters
                   only (presumably so that the discarded non-ASCII output
                   does not influence the result's character width). */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4894
4895
/* Encode the str object `str` as UTF-7 and return a new bytes object, or
   NULL with an exception set.

   base64SetO:       if non-zero, "Set O" characters are base-64 encoded
                     instead of being written as themselves.
   base64WhiteSpace: if non-zero, whitespace characters are base-64 encoded
                     instead of being written as themselves.
   errors:           not used by this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                 /* inside a '+...' base-64 section? */
    Py_ssize_t i;
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;  /* bits waiting to be emitted */
    char * out;
    const char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    /* Allocate for the worst case of 8 bytes per character; the result is
       shrunk to its real size at the end. */
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                        *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a base-64 section for this character */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Append the character to the bit buffer as one or two 16-bit
           units and emit every complete group of 6 bits. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* Flush left-over bits (padded with zero bits) and close a section that
       is still open. */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
        *out++ = '-';
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
4997
/* The base-64 and direct-coding helper macros are only meant for the UTF-7
   codec above; undefine them so that they do not leak into the rest of the
   file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
5003
5004 /* --- UTF-8 Codec -------------------------------------------------------- */
5005
5006 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)5007 PyUnicode_DecodeUTF8(const char *s,
5008 Py_ssize_t size,
5009 const char *errors)
5010 {
5011 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5012 }
5013
5014 #include "stringlib/asciilib.h"
5015 #include "stringlib/codecs.h"
5016 #include "stringlib/undef.h"
5017
5018 #include "stringlib/ucs1lib.h"
5019 #include "stringlib/codecs.h"
5020 #include "stringlib/undef.h"
5021
5022 #include "stringlib/ucs2lib.h"
5023 #include "stringlib/codecs.h"
5024 #include "stringlib/undef.h"
5025
5026 #include "stringlib/ucs4lib.h"
5027 #include "stringlib/codecs.h"
5028 #include "stringlib/undef.h"
5029
/* Mask to quickly check whether a C 'size_t' contains a
   non-ASCII, UTF8-encoded char: the high bit (0x80) of every byte is set,
   so `value & ASCII_CHAR_MASK` is non-zero as soon as one byte of `value`
   is >= 0x80.  Only 4-byte and 8-byte size_t are supported. */
#if (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
#elif (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
#else
# error C 'size_t' size should be either 4 or 8!
#endif
5039
/* Copy the leading run of ASCII bytes of [start, end) into `dest`.

   Scanning stops at the first byte with the high bit set (or at `end`).
   Returns the number of bytes copied, i.e. the length of the ASCII prefix.
   `dest` must have room for that many bytes; in the word-at-a-time path it
   is additionally asserted to be size_t-aligned. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;

#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
    /* Source and destination are both aligned: test and copy one size_t
       at a time, then finish byte by byte. */
    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation (presumably register allocation) */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p + SIZEOF_SIZE_T <= end) {
            size_t value = *(const size_t *) _p;
            /* stop at the first word containing a non-ASCII byte */
            if (value & ASCII_CHAR_MASK)
                break;
            *((size_t *)q) = value;
            _p += SIZEOF_SIZE_T;
            q += SIZEOF_SIZE_T;
        }
        p = _p;
        /* remaining bytes, up to the first non-ASCII one */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
    /* General path: only scan for the end of the ASCII prefix here and copy
       it in a single memcpy() at the end. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
            /* Help allocation (presumably register allocation) */
            const char *_p = p;
            while (_p + SIZEOF_SIZE_T <= end) {
                size_t value = *(const size_t *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_SIZE_T;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
5093
5094 static PyObject *
unicode_decode_utf8(const char * s,Py_ssize_t size,_Py_error_handler error_handler,const char * errors,Py_ssize_t * consumed)5095 unicode_decode_utf8(const char *s, Py_ssize_t size,
5096 _Py_error_handler error_handler, const char *errors,
5097 Py_ssize_t *consumed)
5098 {
5099 if (size == 0) {
5100 if (consumed)
5101 *consumed = 0;
5102 _Py_RETURN_UNICODE_EMPTY();
5103 }
5104
5105 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5106 if (size == 1 && (unsigned char)s[0] < 128) {
5107 if (consumed) {
5108 *consumed = 1;
5109 }
5110 return get_latin1_char((unsigned char)s[0]);
5111 }
5112
5113 const char *starts = s;
5114 const char *end = s + size;
5115
5116 // fast path: try ASCII string.
5117 PyObject *u = PyUnicode_New(size, 127);
5118 if (u == NULL) {
5119 return NULL;
5120 }
5121 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
5122 if (s == end) {
5123 return u;
5124 }
5125
5126 // Use _PyUnicodeWriter after fast path is failed.
5127 _PyUnicodeWriter writer;
5128 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5129 writer.pos = s - starts;
5130
5131 Py_ssize_t startinpos, endinpos;
5132 const char *errmsg = "";
5133 PyObject *error_handler_obj = NULL;
5134 PyObject *exc = NULL;
5135
5136 while (s < end) {
5137 Py_UCS4 ch;
5138 int kind = writer.kind;
5139
5140 if (kind == PyUnicode_1BYTE_KIND) {
5141 if (PyUnicode_IS_ASCII(writer.buffer))
5142 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
5143 else
5144 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
5145 } else if (kind == PyUnicode_2BYTE_KIND) {
5146 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
5147 } else {
5148 assert(kind == PyUnicode_4BYTE_KIND);
5149 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
5150 }
5151
5152 switch (ch) {
5153 case 0:
5154 if (s == end || consumed)
5155 goto End;
5156 errmsg = "unexpected end of data";
5157 startinpos = s - starts;
5158 endinpos = end - starts;
5159 break;
5160 case 1:
5161 errmsg = "invalid start byte";
5162 startinpos = s - starts;
5163 endinpos = startinpos + 1;
5164 break;
5165 case 2:
5166 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5167 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5168 {
5169 /* Truncated surrogate code in range D800-DFFF */
5170 goto End;
5171 }
5172 /* fall through */
5173 case 3:
5174 case 4:
5175 errmsg = "invalid continuation byte";
5176 startinpos = s - starts;
5177 endinpos = startinpos + ch - 1;
5178 break;
5179 default:
5180 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5181 goto onError;
5182 continue;
5183 }
5184
5185 if (error_handler == _Py_ERROR_UNKNOWN)
5186 error_handler = _Py_GetErrorHandler(errors);
5187
5188 switch (error_handler) {
5189 case _Py_ERROR_IGNORE:
5190 s += (endinpos - startinpos);
5191 break;
5192
5193 case _Py_ERROR_REPLACE:
5194 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5195 goto onError;
5196 s += (endinpos - startinpos);
5197 break;
5198
5199 case _Py_ERROR_SURROGATEESCAPE:
5200 {
5201 Py_ssize_t i;
5202
5203 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5204 goto onError;
5205 for (i=startinpos; i<endinpos; i++) {
5206 ch = (Py_UCS4)(unsigned char)(starts[i]);
5207 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5208 ch + 0xdc00);
5209 writer.pos++;
5210 }
5211 s += (endinpos - startinpos);
5212 break;
5213 }
5214
5215 default:
5216 if (unicode_decode_call_errorhandler_writer(
5217 errors, &error_handler_obj,
5218 "utf-8", errmsg,
5219 &starts, &end, &startinpos, &endinpos, &exc, &s,
5220 &writer))
5221 goto onError;
5222 }
5223 }
5224
5225 End:
5226 if (consumed)
5227 *consumed = s - starts;
5228
5229 Py_XDECREF(error_handler_obj);
5230 Py_XDECREF(exc);
5231 return _PyUnicodeWriter_Finish(&writer);
5232
5233 onError:
5234 Py_XDECREF(error_handler_obj);
5235 Py_XDECREF(exc);
5236 _PyUnicodeWriter_Dealloc(&writer);
5237 return NULL;
5238 }
5239
5240
5241 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5242 PyUnicode_DecodeUTF8Stateful(const char *s,
5243 Py_ssize_t size,
5244 const char *errors,
5245 Py_ssize_t *consumed)
5246 {
5247 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5248 }
5249
5250
/* UTF-8 decoder producing a wchar_t string.

   'errors' selects the error handler and must be _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS; for any other
   value the function returns -3.

   On success, return 0, write a pointer to a newly allocated wide character
   string into *wstr (use PyMem_RawFree() to free the memory) and write the
   output length (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure, return -1.

   On decoding error (not possible with surrogateescape, which escapes every
   undecodable byte), return -2. If wlen is non-NULL, write the start of the
   illegal byte sequence into *wlen. If reason is not NULL, write the
   decoding error message into *reason. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
                 const char **reason, _Py_error_handler errors)
{
    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        /* unsupported error handler */
        return -3;
    }

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Make sure (size + 1) * sizeof(wchar_t) cannot overflow. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
        return -1;
    }

    /* One extra unit for the terminating null character. */
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
        return -1;
    }

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        /* A result above 0xFF is a character the decoder could not store
           itself; smaller values are handled as stop codes below. */
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* the whole input has been decoded */
            if (!ch && s == e) {
                break;
            }

            if (surrogateescape) {
                /* escape one undecodable byte b as 0xDC00 + b */
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
            }
            else {
                /* Is it a valid three-byte code? */
                if (surrogatepass
                    && (e - s) >= 3
                    && (s[0] & 0xf0) == 0xe0
                    && (s[1] & 0xc0) == 0x80
                    && (s[2] & 0xc0) == 0x80)
                {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    s += 3;
                    unicode[outpos++] = ch;
                }
                else {
                    /* decoding error: release the buffer and report */
                    PyMem_RawFree(unicode );
                    if (reason != NULL) {
                        switch (ch) {
                        case 0:
                            *reason = "unexpected end of data";
                            break;
                        case 1:
                            *reason = "invalid start byte";
                            break;
                        /* 2, 3, 4 */
                        default:
                            *reason = "invalid continuation byte";
                            break;
                        }
                    }
                    if (wlen != NULL) {
                        *wlen = s - orig_s;
                    }
                    return -2;
                }
            }
        }
    }
    unicode[outpos] = L'\0';
    if (wlen) {
        *wlen = outpos;
    }
    *wstr = unicode;
    return 0;
}
5370
5371
5372 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5373 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5374 size_t *wlen)
5375 {
5376 wchar_t *wstr;
5377 int res = _Py_DecodeUTF8Ex(arg, arglen,
5378 &wstr, wlen,
5379 NULL, _Py_ERROR_SURROGATEESCAPE);
5380 if (res != 0) {
5381 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5382 assert(res != -3);
5383 if (wlen) {
5384 *wlen = (size_t)res;
5385 }
5386 return NULL;
5387 }
5388 return wstr;
5389 }
5390
5391
/* UTF-8 encoder supporting the strict, surrogateescape and surrogatepass
   error handlers (selected by 'errors').

   On success, return 0 and write the newly allocated character string into
   *str (free it with PyMem_RawFree() if raw_malloc is non-zero, with
   PyMem_Free() otherwise).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If 'errors' is not one of the supported error handlers, return -3. */
5402 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5403 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5404 const char **reason, int raw_malloc, _Py_error_handler errors)
5405 {
5406 const Py_ssize_t max_char_size = 4;
5407 Py_ssize_t len = wcslen(text);
5408
5409 assert(len >= 0);
5410
5411 int surrogateescape = 0;
5412 int surrogatepass = 0;
5413 switch (errors)
5414 {
5415 case _Py_ERROR_STRICT:
5416 break;
5417 case _Py_ERROR_SURROGATEESCAPE:
5418 surrogateescape = 1;
5419 break;
5420 case _Py_ERROR_SURROGATEPASS:
5421 surrogatepass = 1;
5422 break;
5423 default:
5424 return -3;
5425 }
5426
5427 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5428 return -1;
5429 }
5430 char *bytes;
5431 if (raw_malloc) {
5432 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5433 }
5434 else {
5435 bytes = PyMem_Malloc((len + 1) * max_char_size);
5436 }
5437 if (bytes == NULL) {
5438 return -1;
5439 }
5440
5441 char *p = bytes;
5442 Py_ssize_t i;
5443 for (i = 0; i < len; ) {
5444 Py_ssize_t ch_pos = i;
5445 Py_UCS4 ch = text[i];
5446 i++;
5447 #if Py_UNICODE_SIZE == 2
5448 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5449 && i < len
5450 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5451 {
5452 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5453 i++;
5454 }
5455 #endif
5456
5457 if (ch < 0x80) {
5458 /* Encode ASCII */
5459 *p++ = (char) ch;
5460
5461 }
5462 else if (ch < 0x0800) {
5463 /* Encode Latin-1 */
5464 *p++ = (char)(0xc0 | (ch >> 6));
5465 *p++ = (char)(0x80 | (ch & 0x3f));
5466 }
5467 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5468 /* surrogateescape error handler */
5469 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5470 if (error_pos != NULL) {
5471 *error_pos = (size_t)ch_pos;
5472 }
5473 if (reason != NULL) {
5474 *reason = "encoding error";
5475 }
5476 if (raw_malloc) {
5477 PyMem_RawFree(bytes);
5478 }
5479 else {
5480 PyMem_Free(bytes);
5481 }
5482 return -2;
5483 }
5484 *p++ = (char)(ch & 0xff);
5485 }
5486 else if (ch < 0x10000) {
5487 *p++ = (char)(0xe0 | (ch >> 12));
5488 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5489 *p++ = (char)(0x80 | (ch & 0x3f));
5490 }
5491 else { /* ch >= 0x10000 */
5492 assert(ch <= MAX_UNICODE);
5493 /* Encode UCS4 Unicode ordinals */
5494 *p++ = (char)(0xf0 | (ch >> 18));
5495 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5496 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5497 *p++ = (char)(0x80 | (ch & 0x3f));
5498 }
5499 }
5500 *p++ = '\0';
5501
5502 size_t final_size = (p - bytes);
5503 char *bytes2;
5504 if (raw_malloc) {
5505 bytes2 = PyMem_RawRealloc(bytes, final_size);
5506 }
5507 else {
5508 bytes2 = PyMem_Realloc(bytes, final_size);
5509 }
5510 if (bytes2 == NULL) {
5511 if (error_pos != NULL) {
5512 *error_pos = (size_t)-1;
5513 }
5514 if (raw_malloc) {
5515 PyMem_RawFree(bytes);
5516 }
5517 else {
5518 PyMem_Free(bytes);
5519 }
5520 return -1;
5521 }
5522 *str = bytes2;
5523 return 0;
5524 }
5525
5526
5527 /* Primary internal function which creates utf8 encoded bytes objects.
5528
5529 Allocation strategy: if the string is short, convert into a stack buffer
5530 and allocate exactly as much space needed at the end. Else allocate the
5531 maximum possible needed (4 result bytes per Unicode character), and return
5532 the excess memory at the end.
5533 */
5534 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5535 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5536 const char *errors)
5537 {
5538 if (!PyUnicode_Check(unicode)) {
5539 PyErr_BadArgument();
5540 return NULL;
5541 }
5542
5543 if (PyUnicode_READY(unicode) == -1)
5544 return NULL;
5545
5546 if (PyUnicode_UTF8(unicode))
5547 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5548 PyUnicode_UTF8_LENGTH(unicode));
5549
5550 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5551 const void *data = PyUnicode_DATA(unicode);
5552 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5553
5554 _PyBytesWriter writer;
5555 char *end;
5556
5557 switch (kind) {
5558 default:
5559 Py_UNREACHABLE();
5560 case PyUnicode_1BYTE_KIND:
5561 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5562 assert(!PyUnicode_IS_ASCII(unicode));
5563 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5564 break;
5565 case PyUnicode_2BYTE_KIND:
5566 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5567 break;
5568 case PyUnicode_4BYTE_KIND:
5569 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5570 break;
5571 }
5572
5573 if (end == NULL) {
5574 _PyBytesWriter_Dealloc(&writer);
5575 return NULL;
5576 }
5577 return _PyBytesWriter_Finish(&writer, end);
5578 }
5579
5580 static int
unicode_fill_utf8(PyObject * unicode)5581 unicode_fill_utf8(PyObject *unicode)
5582 {
5583 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5584 assert(!PyUnicode_IS_ASCII(unicode));
5585
5586 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5587 const void *data = PyUnicode_DATA(unicode);
5588 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5589
5590 _PyBytesWriter writer;
5591 char *end;
5592
5593 switch (kind) {
5594 default:
5595 Py_UNREACHABLE();
5596 case PyUnicode_1BYTE_KIND:
5597 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5598 _Py_ERROR_STRICT, NULL);
5599 break;
5600 case PyUnicode_2BYTE_KIND:
5601 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5602 _Py_ERROR_STRICT, NULL);
5603 break;
5604 case PyUnicode_4BYTE_KIND:
5605 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5606 _Py_ERROR_STRICT, NULL);
5607 break;
5608 }
5609 if (end == NULL) {
5610 _PyBytesWriter_Dealloc(&writer);
5611 return -1;
5612 }
5613
5614 const char *start = writer.use_small_buffer ? writer.small_buffer :
5615 PyBytes_AS_STRING(writer.buffer);
5616 Py_ssize_t len = end - start;
5617
5618 char *cache = PyObject_Malloc(len + 1);
5619 if (cache == NULL) {
5620 _PyBytesWriter_Dealloc(&writer);
5621 PyErr_NoMemory();
5622 return -1;
5623 }
5624 _PyUnicode_UTF8(unicode) = cache;
5625 _PyUnicode_UTF8_LENGTH(unicode) = len;
5626 memcpy(cache, start, len);
5627 cache[len] = '\0';
5628 _PyBytesWriter_Dealloc(&writer);
5629 return 0;
5630 }
5631
5632 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5633 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5634 {
5635 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5636 }
5637
5638
5639 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5640 PyUnicode_AsUTF8String(PyObject *unicode)
5641 {
5642 return _PyUnicode_AsUTF8String(unicode, NULL);
5643 }
5644
5645 /* --- UTF-32 Codec ------------------------------------------------------- */
5646
5647 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5648 PyUnicode_DecodeUTF32(const char *s,
5649 Py_ssize_t size,
5650 const char *errors,
5651 int *byteorder)
5652 {
5653 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5654 }
5655
/* Decode `size` bytes of UTF-32 data starting at `s` into a Unicode object.

   errors:    error handler name, forwarded to the decode error callback.
   byteorder: optional in/out parameter.  On input, *byteorder < 0 selects
              little-endian and > 0 big-endian; 0 (or a NULL pointer) means
              "look for a BOM, else use the host byte order".  When it was 0
              and at least 4 bytes are available, the detected order (-1, 1,
              or still 0 if there is no BOM) is written back.
   consumed:  if not NULL, trailing bytes that do not form a complete 4-byte
              unit are not an error; decoding stops before them and
              *consumed receives the number of bytes processed.

   Return a new Unicode object, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* the first 4 bytes are read as a little-endian value, so a
           big-endian BOM shows up byte-swapped as 0xFFFE0000 */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* nothing left to decode (empty input, or a BOM only) */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means no BOM was found: use the host order */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    /* one output character per 4 input bytes, rounded up */
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* largest code point the current writer buffer can hold; re-read on
           every iteration of the outer loop */
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: write directly into the writer buffer for as long
               as each code point fits (ch <= maxch) and is not a surrogate.
               The loop leaves ch set to the offending value when it stops
               early, and q still pointing at that unit. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* the fast path ran to its end, or fewer than 4 bytes remain */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit the current buffer: hand valid code points to
               the writer's generic write call, then resume the fast path */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        /* starts, e and q are passed by address, so the call may update
           them */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5800
/* Encode a Unicode object to a UTF-32 bytes object.

   errors:    error handler name, used for characters the kind-specific
              encoders stop on (reported as "surrogates not allowed").
   byteorder: 0 writes a BOM followed by data in host byte order; on
              little-endian hosts values <= 0 use the host order, on
              big-endian hosts values >= 0 do.  The encoding name used in
              error reports is "utf-32-le" for -1, "utf-32-be" for 1 and
              "utf-32" otherwise.

   Return a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* one 4-byte unit per character, plus one for the BOM if requested;
       reject lengths for which nsize * 4 would overflow */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    /* the early "goto done" exits below skip the Py_XDECREF calls, which is
       fine because errorHandler and exc are still NULL at those points */
    if (len == 0)
        goto done;

    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    if (kind == PyUnicode_1BYTE_KIND) {
        /* no error handling on this path: the return value is not checked */
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* encode as far as possible; the helper returns how many characters
           it consumed and advances out */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* the encoder stopped at pos: ask the error handler for a
           replacement and the position to resume from */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* a bytes replacement is copied verbatim, so it must consist of
               whole 4-byte units */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            /* a str replacement is only accepted if it is pure ASCII */
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* units needed beyond those already reserved for the characters the
           handler skipped (newpos - pos of them) */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* remember the write offset: resizing may move the buffer */
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* the result of the resize is not checked here; v is returned as the
       call leaves it */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5949
5950 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5951 PyUnicode_AsUTF32String(PyObject *unicode)
5952 {
5953 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5954 }
5955
5956 /* --- UTF-16 Codec ------------------------------------------------------- */
5957
5958 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5959 PyUnicode_DecodeUTF16(const char *s,
5960 Py_ssize_t size,
5961 const char *errors,
5962 int *byteorder)
5963 {
5964 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5965 }
5966
/* Decode `size` bytes of UTF-16 data starting at `s` into a Unicode object.

   errors:    error handler name, forwarded to the decode error callback.
   byteorder: optional in/out parameter.  When *byteorder is 0 (or the
              pointer is NULL) and at least 2 bytes are available, a leading
              BOM is consumed and selects the order; the resulting value
              (-1, 1, or still 0 if there is no BOM) is written back.
              Without a BOM, 0 means host byte order.
   consumed:  if not NULL, incomplete trailing data is not an error;
              decoding stops before it and *consumed receives the number of
              bytes processed.

   Return a new Unicode object, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* the first 2 bytes are read as a little-endian value, so a
           big-endian BOM shows up byte-swapped as 0xFFFE */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* nothing left to decode (empty input, or a BOM only) */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    /* at most one output character per 2 input bytes, rounded up */
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Run the decoder matching the writer's current buffer kind.
               It advances q and writer.pos; its return value is
               interpreted by the switch below (0-3 are status codes,
               anything else is a character to write). */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* step back over the 2 bytes the decoder already consumed, so
               that a stateful caller sees them as not yet processed */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* the error covers the 2 bytes just before q */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* the error covers the first 2 of the 4 bytes just before q */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* a character the decoder did not store itself: write it
               through the writer and decode again */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* starts, e and q are passed by address, so the call may update
           them */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6121
/* Encode a Unicode object to a UTF-16 bytes object.

   errors:    error handler name, used for characters the kind-specific
              encoders stop on (reported as "surrogates not allowed").
   byteorder: 0 writes a BOM followed by data in host byte order; on
              big-endian hosts values >= 0 use the host order, on
              little-endian hosts values <= 0 do.  The encoding name used in
              error reports is "utf-16-le" for negative values, "utf-16-be"
              for positive values and "utf-16" for 0.

   Return a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs;
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* count the characters that need a surrogate pair (two 16-bit units);
       only 4-byte strings can contain them */
    pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *in = (const Py_UCS4 *)data;
        const Py_UCS4 *end = in + len;
        while (in < end) {
            if (*in++ >= 0x10000) {
                pairs++;
            }
        }
    }
    /* reject lengths for which nsize * 2 would overflow */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    /* the early "goto done" exits below skip the Py_XDECREF calls, which is
       fine because errorHandler and exc are still NULL at those points */
    if (len == 0) {
        goto done;
    }

    if (kind == PyUnicode_1BYTE_KIND) {
        /* no error handling on this path: the return value is not checked */
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* encode as far as possible; the helper returns how many characters
           it consumed and advances out */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* the encoder stopped at pos: ask the error handler for a
           replacement and the position to resume from */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* a bytes replacement is copied verbatim, so it must consist of
               whole 2-byte units */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            /* a str replacement is only accepted if it is pure ASCII */
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* units needed beyond those already reserved for the characters the
           handler skipped (newpos - pos of them) */
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* remember the write offset: resizing may move the buffer */
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                goto error;
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
    encoding of a string containing isolated surrogates and the 'ignore' handler
    is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    /* the result of the resize is not checked here; v is returned as the
       call leaves it */
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
/* NOTE(review): STORECHAR is not defined anywhere in this function; this
   #undef looks like a leftover -- confirm before removing. */
#undef STORECHAR
}
6289
6290 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6291 PyUnicode_AsUTF16String(PyObject *unicode)
6292 {
6293 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6294 }
6295
6296 /* --- Unicode Escape Codec ----------------------------------------------- */
6297
6298 static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
6299
6300 PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed,const char ** first_invalid_escape)6301 _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
6302 Py_ssize_t size,
6303 const char *errors,
6304 Py_ssize_t *consumed,
6305 const char **first_invalid_escape)
6306 {
6307 const char *starts = s;
6308 _PyUnicodeWriter writer;
6309 const char *end;
6310 PyObject *errorHandler = NULL;
6311 PyObject *exc = NULL;
6312
6313 // so we can remember if we've seen an invalid escape char or not
6314 *first_invalid_escape = NULL;
6315
6316 if (size == 0) {
6317 if (consumed) {
6318 *consumed = 0;
6319 }
6320 _Py_RETURN_UNICODE_EMPTY();
6321 }
6322 /* Escaped strings will always be longer than the resulting
6323 Unicode string, so we start with size here and then reduce the
6324 length after conversion to the true value.
6325 (but if the error callback returns a long replacement string
6326 we'll have to allocate more space) */
6327 _PyUnicodeWriter_Init(&writer);
6328 writer.min_length = size;
6329 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6330 goto onError;
6331 }
6332
6333 end = s + size;
6334 while (s < end) {
6335 unsigned char c = (unsigned char) *s++;
6336 Py_UCS4 ch;
6337 int count;
6338 const char *message;
6339
6340 #define WRITE_ASCII_CHAR(ch) \
6341 do { \
6342 assert(ch <= 127); \
6343 assert(writer.pos < writer.size); \
6344 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6345 } while(0)
6346
6347 #define WRITE_CHAR(ch) \
6348 do { \
6349 if (ch <= writer.maxchar) { \
6350 assert(writer.pos < writer.size); \
6351 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6352 } \
6353 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6354 goto onError; \
6355 } \
6356 } while(0)
6357
6358 /* Non-escape characters are interpreted as Unicode ordinals */
6359 if (c != '\\') {
6360 WRITE_CHAR(c);
6361 continue;
6362 }
6363
6364 Py_ssize_t startinpos = s - starts - 1;
6365 /* \ - Escapes */
6366 if (s >= end) {
6367 message = "\\ at end of string";
6368 goto incomplete;
6369 }
6370 c = (unsigned char) *s++;
6371
6372 assert(writer.pos < writer.size);
6373 switch (c) {
6374
6375 /* \x escapes */
6376 case '\n': continue;
6377 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6378 case '\'': WRITE_ASCII_CHAR('\''); continue;
6379 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6380 case 'b': WRITE_ASCII_CHAR('\b'); continue;
6381 /* FF */
6382 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6383 case 't': WRITE_ASCII_CHAR('\t'); continue;
6384 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6385 case 'r': WRITE_ASCII_CHAR('\r'); continue;
6386 /* VT */
6387 case 'v': WRITE_ASCII_CHAR('\013'); continue;
6388 /* BEL, not classic C */
6389 case 'a': WRITE_ASCII_CHAR('\007'); continue;
6390
6391 /* \OOO (octal) escapes */
6392 case '0': case '1': case '2': case '3':
6393 case '4': case '5': case '6': case '7':
6394 ch = c - '0';
6395 if (s < end && '0' <= *s && *s <= '7') {
6396 ch = (ch<<3) + *s++ - '0';
6397 if (s < end && '0' <= *s && *s <= '7') {
6398 ch = (ch<<3) + *s++ - '0';
6399 }
6400 }
6401 if (ch > 0377) {
6402 if (*first_invalid_escape == NULL) {
6403 *first_invalid_escape = s-3; /* Back up 3 chars, since we've
6404 already incremented s. */
6405 }
6406 }
6407 WRITE_CHAR(ch);
6408 continue;
6409
6410 /* hex escapes */
6411 /* \xXX */
6412 case 'x':
6413 count = 2;
6414 message = "truncated \\xXX escape";
6415 goto hexescape;
6416
6417 /* \uXXXX */
6418 case 'u':
6419 count = 4;
6420 message = "truncated \\uXXXX escape";
6421 goto hexescape;
6422
6423 /* \UXXXXXXXX */
6424 case 'U':
6425 count = 8;
6426 message = "truncated \\UXXXXXXXX escape";
6427 hexescape:
6428 for (ch = 0; count; ++s, --count) {
6429 if (s >= end) {
6430 goto incomplete;
6431 }
6432 c = (unsigned char)*s;
6433 ch <<= 4;
6434 if (c >= '0' && c <= '9') {
6435 ch += c - '0';
6436 }
6437 else if (c >= 'a' && c <= 'f') {
6438 ch += c - ('a' - 10);
6439 }
6440 else if (c >= 'A' && c <= 'F') {
6441 ch += c - ('A' - 10);
6442 }
6443 else {
6444 goto error;
6445 }
6446 }
6447
6448 /* when we get here, ch is a 32-bit unicode character */
6449 if (ch > MAX_UNICODE) {
6450 message = "illegal Unicode character";
6451 goto error;
6452 }
6453
6454 WRITE_CHAR(ch);
6455 continue;
6456
6457 /* \N{name} */
6458 case 'N':
6459 if (ucnhash_capi == NULL) {
6460 /* load the unicode data module */
6461 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6462 PyUnicodeData_CAPSULE_NAME, 1);
6463 if (ucnhash_capi == NULL) {
6464 PyErr_SetString(
6465 PyExc_UnicodeError,
6466 "\\N escapes not supported (can't load unicodedata module)"
6467 );
6468 goto onError;
6469 }
6470 }
6471
6472 message = "malformed \\N character escape";
6473 if (s >= end) {
6474 goto incomplete;
6475 }
6476 if (*s == '{') {
6477 const char *start = ++s;
6478 size_t namelen;
6479 /* look for the closing brace */
6480 while (s < end && *s != '}')
6481 s++;
6482 if (s >= end) {
6483 goto incomplete;
6484 }
6485 namelen = s - start;
6486 if (namelen) {
6487 /* found a name. look it up in the unicode database */
6488 s++;
6489 ch = 0xffffffff; /* in case 'getcode' messes up */
6490 if (namelen <= INT_MAX &&
6491 ucnhash_capi->getcode(start, (int)namelen,
6492 &ch, 0)) {
6493 assert(ch <= MAX_UNICODE);
6494 WRITE_CHAR(ch);
6495 continue;
6496 }
6497 message = "unknown Unicode character name";
6498 }
6499 }
6500 goto error;
6501
6502 default:
6503 if (*first_invalid_escape == NULL) {
6504 *first_invalid_escape = s-1; /* Back up one char, since we've
6505 already incremented s. */
6506 }
6507 WRITE_ASCII_CHAR('\\');
6508 WRITE_CHAR(c);
6509 continue;
6510 }
6511
6512 incomplete:
6513 if (consumed) {
6514 *consumed = startinpos;
6515 break;
6516 }
6517 error:;
6518 Py_ssize_t endinpos = s-starts;
6519 writer.min_length = end - s + writer.pos;
6520 if (unicode_decode_call_errorhandler_writer(
6521 errors, &errorHandler,
6522 "unicodeescape", message,
6523 &starts, &end, &startinpos, &endinpos, &exc, &s,
6524 &writer)) {
6525 goto onError;
6526 }
6527 assert(end - s <= writer.size - writer.pos);
6528
6529 #undef WRITE_ASCII_CHAR
6530 #undef WRITE_CHAR
6531 }
6532
6533 Py_XDECREF(errorHandler);
6534 Py_XDECREF(exc);
6535 return _PyUnicodeWriter_Finish(&writer);
6536
6537 onError:
6538 _PyUnicodeWriter_Dealloc(&writer);
6539 Py_XDECREF(errorHandler);
6540 Py_XDECREF(exc);
6541 return NULL;
6542 }
6543
6544 PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)6545 _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6546 Py_ssize_t size,
6547 const char *errors,
6548 Py_ssize_t *consumed)
6549 {
6550 const char *first_invalid_escape;
6551 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6552 consumed,
6553 &first_invalid_escape);
6554 if (result == NULL)
6555 return NULL;
6556 if (first_invalid_escape != NULL) {
6557 unsigned char c = *first_invalid_escape;
6558 if ('4' <= c && c <= '7') {
6559 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6560 "invalid octal escape sequence '\\%.3s'",
6561 first_invalid_escape) < 0)
6562 {
6563 Py_DECREF(result);
6564 return NULL;
6565 }
6566 }
6567 else {
6568 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6569 "invalid escape sequence '\\%c'",
6570 c) < 0)
6571 {
6572 Py_DECREF(result);
6573 return NULL;
6574 }
6575 }
6576 }
6577 return result;
6578 }
6579
6580 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6581 PyUnicode_DecodeUnicodeEscape(const char *s,
6582 Py_ssize_t size,
6583 const char *errors)
6584 {
6585 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6586 }
6587
6588 /* Return a Unicode-Escape string version of the Unicode object. */
6589
6590 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6591 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6592 {
6593 Py_ssize_t i, len;
6594 PyObject *repr;
6595 char *p;
6596 enum PyUnicode_Kind kind;
6597 const void *data;
6598 Py_ssize_t expandsize;
6599
6600 /* Initial allocation is based on the longest-possible character
6601 escape.
6602
6603 For UCS1 strings it's '\xxx', 4 bytes per source character.
6604 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6605 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6606 */
6607
6608 if (!PyUnicode_Check(unicode)) {
6609 PyErr_BadArgument();
6610 return NULL;
6611 }
6612 if (PyUnicode_READY(unicode) == -1) {
6613 return NULL;
6614 }
6615
6616 len = PyUnicode_GET_LENGTH(unicode);
6617 if (len == 0) {
6618 return PyBytes_FromStringAndSize(NULL, 0);
6619 }
6620
6621 kind = PyUnicode_KIND(unicode);
6622 data = PyUnicode_DATA(unicode);
6623 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6624 bytes, and 1 byte characters 4. */
6625 expandsize = kind * 2 + 2;
6626 if (len > PY_SSIZE_T_MAX / expandsize) {
6627 return PyErr_NoMemory();
6628 }
6629 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6630 if (repr == NULL) {
6631 return NULL;
6632 }
6633
6634 p = PyBytes_AS_STRING(repr);
6635 for (i = 0; i < len; i++) {
6636 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6637
6638 /* U+0000-U+00ff range */
6639 if (ch < 0x100) {
6640 if (ch >= ' ' && ch < 127) {
6641 if (ch != '\\') {
6642 /* Copy printable US ASCII as-is */
6643 *p++ = (char) ch;
6644 }
6645 /* Escape backslashes */
6646 else {
6647 *p++ = '\\';
6648 *p++ = '\\';
6649 }
6650 }
6651
6652 /* Map special whitespace to '\t', \n', '\r' */
6653 else if (ch == '\t') {
6654 *p++ = '\\';
6655 *p++ = 't';
6656 }
6657 else if (ch == '\n') {
6658 *p++ = '\\';
6659 *p++ = 'n';
6660 }
6661 else if (ch == '\r') {
6662 *p++ = '\\';
6663 *p++ = 'r';
6664 }
6665
6666 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6667 else {
6668 *p++ = '\\';
6669 *p++ = 'x';
6670 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6671 *p++ = Py_hexdigits[ch & 0x000F];
6672 }
6673 }
6674 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6675 else if (ch < 0x10000) {
6676 *p++ = '\\';
6677 *p++ = 'u';
6678 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6679 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6680 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6681 *p++ = Py_hexdigits[ch & 0x000F];
6682 }
6683 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6684 else {
6685
6686 /* Make sure that the first two digits are zero */
6687 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6688 *p++ = '\\';
6689 *p++ = 'U';
6690 *p++ = '0';
6691 *p++ = '0';
6692 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6693 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6694 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6695 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6696 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6697 *p++ = Py_hexdigits[ch & 0x0000000F];
6698 }
6699 }
6700
6701 assert(p - PyBytes_AS_STRING(repr) > 0);
6702 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6703 return NULL;
6704 }
6705 return repr;
6706 }
6707
6708 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6709
/* Decode `size` bytes starting at `s` with the "rawunicodeescape" codec.

   Only \uXXXX (4 hex digits) and \UXXXXXXXX (8 hex digits) are interpreted.
   Every other byte -- including a backslash followed by any other
   character -- is written through as the code point with that ordinal.

   errors:   error handler name, forwarded to
             unicode_decode_call_errorhandler_writer() for truncated or
             non-hex escapes and for values above MAX_UNICODE.
   consumed: if non-NULL, input that ends in the middle of an escape is not
             treated as an error: decoding stops and *consumed receives the
             offset of the backslash that starts the incomplete escape.
             It is set to 0 for empty input; this function does not write it
             on any other path.

   Returns the object produced by _PyUnicodeWriter_Finish(), or NULL after
   releasing the writer when an error occurred. */
PyObject *
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
                                          Py_ssize_t size,
                                          const char *errors,
                                          Py_ssize_t *consumed)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed) {
            *consumed = 0;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        const char *message;

        /* Write one code point: store it directly when it fits the writer's
           current maxchar, otherwise go through
           _PyUnicodeWriter_WriteCharInline(). */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals.
           A backslash that is the very last byte is also copied through
           when no `consumed` pointer was supplied. */
        if (c != '\\' || (s >= end && !consumed)) {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash (s has already moved past it). */
        Py_ssize_t startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            assert(consumed);
            // Set message to silence a compiler warning.
            // It is never actually used on this path.
            message = "\\ at end of string";
            goto incomplete;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not a \u or \U escape: emit the backslash and the following
               character unchanged. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count; ++s, --count) {
            if (s >= end) {
                /* Input ended before all digits were read. */
                goto incomplete;
            }
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                /* Non-hex character inside the escape. */
                goto error;
            }
        }
        if (ch > MAX_UNICODE) {
            message = "\\Uxxxxxxxx out of range";
            goto error;
        }
        WRITE_CHAR(ch);
        continue;

      incomplete:
        /* With a `consumed` pointer, report where the incomplete escape
           starts and stop; otherwise fall through to error handling. */
        if (consumed) {
            *consumed = startinpos;
            break;
        }
      error:;
        Py_ssize_t endinpos = s-starts;
        /* Remaining input plus what has been written so far. */
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        /* The direct writes above rely on the writer having room for all
           remaining input bytes. */
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6845
6846 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6847 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6848 Py_ssize_t size,
6849 const char *errors)
6850 {
6851 return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6852 }
6853
6854
6855 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6856 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6857 {
6858 PyObject *repr;
6859 char *p;
6860 Py_ssize_t expandsize, pos;
6861 int kind;
6862 const void *data;
6863 Py_ssize_t len;
6864
6865 if (!PyUnicode_Check(unicode)) {
6866 PyErr_BadArgument();
6867 return NULL;
6868 }
6869 if (PyUnicode_READY(unicode) == -1) {
6870 return NULL;
6871 }
6872 kind = PyUnicode_KIND(unicode);
6873 data = PyUnicode_DATA(unicode);
6874 len = PyUnicode_GET_LENGTH(unicode);
6875 if (kind == PyUnicode_1BYTE_KIND) {
6876 return PyBytes_FromStringAndSize(data, len);
6877 }
6878
6879 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6880 bytes, and 1 byte characters 4. */
6881 expandsize = kind * 2 + 2;
6882
6883 if (len > PY_SSIZE_T_MAX / expandsize) {
6884 return PyErr_NoMemory();
6885 }
6886 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6887 if (repr == NULL) {
6888 return NULL;
6889 }
6890 if (len == 0) {
6891 return repr;
6892 }
6893
6894 p = PyBytes_AS_STRING(repr);
6895 for (pos = 0; pos < len; pos++) {
6896 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6897
6898 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6899 if (ch < 0x100) {
6900 *p++ = (char) ch;
6901 }
6902 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6903 else if (ch < 0x10000) {
6904 *p++ = '\\';
6905 *p++ = 'u';
6906 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6907 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6908 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6909 *p++ = Py_hexdigits[ch & 15];
6910 }
6911 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6912 else {
6913 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6914 *p++ = '\\';
6915 *p++ = 'U';
6916 *p++ = '0';
6917 *p++ = '0';
6918 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6919 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6920 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6921 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6922 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6923 *p++ = Py_hexdigits[ch & 15];
6924 }
6925 }
6926
6927 assert(p > PyBytes_AS_STRING(repr));
6928 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6929 return NULL;
6930 }
6931 return repr;
6932 }
6933
6934 /* --- Latin-1 Codec ------------------------------------------------------ */
6935
6936 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6937 PyUnicode_DecodeLatin1(const char *s,
6938 Py_ssize_t size,
6939 const char *errors)
6940 {
6941 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6942 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6943 }
6944
6945 /* create or adjust a UnicodeEncodeError */
6946 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6947 make_encode_exception(PyObject **exceptionObject,
6948 const char *encoding,
6949 PyObject *unicode,
6950 Py_ssize_t startpos, Py_ssize_t endpos,
6951 const char *reason)
6952 {
6953 if (*exceptionObject == NULL) {
6954 *exceptionObject = PyObject_CallFunction(
6955 PyExc_UnicodeEncodeError, "sOnns",
6956 encoding, unicode, startpos, endpos, reason);
6957 }
6958 else {
6959 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6960 goto onError;
6961 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6962 goto onError;
6963 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6964 goto onError;
6965 return;
6966 onError:
6967 Py_CLEAR(*exceptionObject);
6968 }
6969 }
6970
6971 /* raises a UnicodeEncodeError */
6972 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6973 raise_encode_exception(PyObject **exceptionObject,
6974 const char *encoding,
6975 PyObject *unicode,
6976 Py_ssize_t startpos, Py_ssize_t endpos,
6977 const char *reason)
6978 {
6979 make_encode_exception(exceptionObject,
6980 encoding, unicode, startpos, endpos, reason);
6981 if (*exceptionObject != NULL)
6982 PyCodec_StrictErrors(*exceptionObject);
6983 }
6984
6985 /* error handling callback helper:
6986 build arguments, call the callback and check the arguments,
6987 put the result into newpos and return the replacement string, which
6988 has to be freed by the caller */
6989 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6990 unicode_encode_call_errorhandler(const char *errors,
6991 PyObject **errorHandler,
6992 const char *encoding, const char *reason,
6993 PyObject *unicode, PyObject **exceptionObject,
6994 Py_ssize_t startpos, Py_ssize_t endpos,
6995 Py_ssize_t *newpos)
6996 {
6997 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6998 Py_ssize_t len;
6999 PyObject *restuple;
7000 PyObject *resunicode;
7001
7002 if (*errorHandler == NULL) {
7003 *errorHandler = PyCodec_LookupError(errors);
7004 if (*errorHandler == NULL)
7005 return NULL;
7006 }
7007
7008 if (PyUnicode_READY(unicode) == -1)
7009 return NULL;
7010 len = PyUnicode_GET_LENGTH(unicode);
7011
7012 make_encode_exception(exceptionObject,
7013 encoding, unicode, startpos, endpos, reason);
7014 if (*exceptionObject == NULL)
7015 return NULL;
7016
7017 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7018 if (restuple == NULL)
7019 return NULL;
7020 if (!PyTuple_Check(restuple)) {
7021 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7022 Py_DECREF(restuple);
7023 return NULL;
7024 }
7025 if (!PyArg_ParseTuple(restuple, argparse,
7026 &resunicode, newpos)) {
7027 Py_DECREF(restuple);
7028 return NULL;
7029 }
7030 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7031 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7032 Py_DECREF(restuple);
7033 return NULL;
7034 }
7035 if (*newpos<0)
7036 *newpos = len + *newpos;
7037 if (*newpos<0 || *newpos>len) {
7038 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7039 Py_DECREF(restuple);
7040 return NULL;
7041 }
7042 Py_INCREF(resunicode);
7043 Py_DECREF(restuple);
7044 return resunicode;
7045 }
7046
/* Encode `unicode` into a bytes object where every code point below `limit`
   becomes one byte.

   limit:  256 selects the "latin-1" encoding name and error reason; any
           other value selects "ascii" (callers in this file pass 128).
   errors: error handler name.  strict, replace, ignore, backslashreplace,
           xmlcharrefreplace and surrogateescape are handled inline; any
           other handler is invoked through
           unicode_encode_call_errorhandler().

   Consecutive unencodable characters are collected into one range
   [collstart, collend) and handed to the error handler together.

   Returns the bytes object produced by _PyBytesWriter_Finish(), or NULL on
   error. */
static PyObject *
unicode_encode_ucs1(PyObject *unicode,
                    const char *errors,
                    const Py_UCS4 limit)
{
    /* input state */
    Py_ssize_t pos=0, size;
    int kind;
    const void *data;
    /* pointer into the output */
    char *str;
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    PyObject *rep = NULL;
    /* output object */
    _PyBytesWriter writer;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    _PyBytesWriter_Init(&writer);
    str = _PyBytesWriter_Alloc(&writer, size);
    if (str == NULL)
        return NULL;

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* can we encode this? */
        if (ch < limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)ch;
            ++pos;
        }
        else {
            Py_ssize_t newpos, i;
            /* startpos for collecting unencodable chars */
            Py_ssize_t collstart = pos;
            Py_ssize_t collend = collstart + 1;
            /* find all unencodable characters */

            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
                ++collend;

            /* Only overallocate the buffer if it's not the last write */
            writer.overallocate = (collend < size);

            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (error_handler == _Py_ERROR_UNKNOWN)
                error_handler = _Py_GetErrorHandler(errors);

            switch (error_handler) {
            case _Py_ERROR_STRICT:
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
                goto onError;

            case _Py_ERROR_REPLACE:
                /* one '?' per unencodable character */
                memset(str, '?', collend - collstart);
                str += (collend - collstart);
                /* fall through */
            case _Py_ERROR_IGNORE:
                pos = collend;
                break;

            case _Py_ERROR_BACKSLASHREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = backslashreplace(&writer, str,
                                       unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_XMLCHARREFREPLACE:
                /* subtract preallocated bytes */
                writer.min_size -= (collend - collstart);
                str = xmlcharrefreplace(&writer, str,
                                        unicode, collstart, collend);
                if (str == NULL)
                    goto onError;
                pos = collend;
                break;

            case _Py_ERROR_SURROGATEESCAPE:
                /* Turn each code point U+DC80..U+DCFF back into the byte
                   0x80..0xFF; stop at the first character outside that
                   range. */
                for (i = collstart; i < collend; ++i) {
                    ch = PyUnicode_READ(kind, data, i);
                    if (ch < 0xdc80 || 0xdcff < ch) {
                        /* Not a UTF-8b surrogate */
                        break;
                    }
                    *str++ = (char)(ch - 0xdc00);
                    ++pos;
                }
                /* whole range handled inline */
                if (i >= collend)
                    break;
                /* otherwise the remaining range goes through the generic
                   handler path below */
                collstart = pos;
                assert(collstart != collend);
                /* fall through */

            default:
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
                                                       encoding, reason, unicode, &exc,
                                                       collstart, collend, &newpos);
                if (rep == NULL)
                    goto onError;

                if (newpos < collstart) {
                    /* The handler moved the position backwards: characters
                       in [newpos, collstart) will be encoded again, so
                       reserve room for them. */
                    writer.overallocate = 1;
                    str = _PyBytesWriter_Prepare(&writer, str,
                                                 collstart - newpos);
                    if (str == NULL)
                        goto onError;
                }
                else {
                    /* subtract preallocated bytes */
                    writer.min_size -= newpos - collstart;
                    /* Only overallocate the buffer if it's not the last write */
                    writer.overallocate = (newpos < size);
                }

                if (PyBytes_Check(rep)) {
                    /* Directly copy bytes result to output. */
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyBytes_AS_STRING(rep),
                                                    PyBytes_GET_SIZE(rep));
                }
                else {
                    assert(PyUnicode_Check(rep));

                    if (PyUnicode_READY(rep) < 0)
                        goto onError;

                    /* A str replacement is accepted only if it is of 1-byte
                       kind (limit 256) or ASCII-only (otherwise). */
                    if (limit == 256 ?
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
                        !PyUnicode_IS_ASCII(rep))
                    {
                        /* Not all characters are smaller than limit */
                        raise_encode_exception(&exc, encoding, unicode,
                                               collstart, collend, reason);
                        goto onError;
                    }
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
                    str = _PyBytesWriter_WriteBytes(&writer, str,
                                                    PyUnicode_DATA(rep),
                                                    PyUnicode_GET_LENGTH(rep));
                }
                if (str == NULL)
                    goto onError;

                pos = newpos;
                Py_CLEAR(rep);
            }

            /* If overallocation was disabled, ensure that it was the last
               write. Otherwise, we missed an optimization */
            assert(writer.overallocate || pos == size);
        }
    }

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyBytesWriter_Finish(&writer, str);

  onError:
    /* rep is still non-NULL here only when a failure occurred after the
       handler returned it */
    Py_XDECREF(rep);
    _PyBytesWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7228
7229 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)7230 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7231 {
7232 if (!PyUnicode_Check(unicode)) {
7233 PyErr_BadArgument();
7234 return NULL;
7235 }
7236 if (PyUnicode_READY(unicode) == -1)
7237 return NULL;
7238 /* Fast path: if it is a one-byte string, construct
7239 bytes object directly. */
7240 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7241 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7242 PyUnicode_GET_LENGTH(unicode));
7243 /* Non-Latin-1 characters present. Defer to above function to
7244 raise the exception. */
7245 return unicode_encode_ucs1(unicode, errors, 256);
7246 }
7247
7248 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)7249 PyUnicode_AsLatin1String(PyObject *unicode)
7250 {
7251 return _PyUnicode_AsLatin1String(unicode, NULL);
7252 }
7253
7254 /* --- 7-bit ASCII Codec -------------------------------------------------- */
7255
7256 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)7257 PyUnicode_DecodeASCII(const char *s,
7258 Py_ssize_t size,
7259 const char *errors)
7260 {
7261 const char *starts = s;
7262 const char *e = s + size;
7263 PyObject *error_handler_obj = NULL;
7264 PyObject *exc = NULL;
7265 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7266
7267 if (size == 0)
7268 _Py_RETURN_UNICODE_EMPTY();
7269
7270 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7271 if (size == 1 && (unsigned char)s[0] < 128) {
7272 return get_latin1_char((unsigned char)s[0]);
7273 }
7274
7275 // Shortcut for simple case
7276 PyObject *u = PyUnicode_New(size, 127);
7277 if (u == NULL) {
7278 return NULL;
7279 }
7280 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7281 if (outpos == size) {
7282 return u;
7283 }
7284
7285 _PyUnicodeWriter writer;
7286 _PyUnicodeWriter_InitWithBuffer(&writer, u);
7287 writer.pos = outpos;
7288
7289 s += outpos;
7290 int kind = writer.kind;
7291 void *data = writer.data;
7292 Py_ssize_t startinpos, endinpos;
7293
7294 while (s < e) {
7295 unsigned char c = (unsigned char)*s;
7296 if (c < 128) {
7297 PyUnicode_WRITE(kind, data, writer.pos, c);
7298 writer.pos++;
7299 ++s;
7300 continue;
7301 }
7302
7303 /* byte outsize range 0x00..0x7f: call the error handler */
7304
7305 if (error_handler == _Py_ERROR_UNKNOWN)
7306 error_handler = _Py_GetErrorHandler(errors);
7307
7308 switch (error_handler)
7309 {
7310 case _Py_ERROR_REPLACE:
7311 case _Py_ERROR_SURROGATEESCAPE:
7312 /* Fast-path: the error handler only writes one character,
7313 but we may switch to UCS2 at the first write */
7314 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7315 goto onError;
7316 kind = writer.kind;
7317 data = writer.data;
7318
7319 if (error_handler == _Py_ERROR_REPLACE)
7320 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7321 else
7322 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7323 writer.pos++;
7324 ++s;
7325 break;
7326
7327 case _Py_ERROR_IGNORE:
7328 ++s;
7329 break;
7330
7331 default:
7332 startinpos = s-starts;
7333 endinpos = startinpos + 1;
7334 if (unicode_decode_call_errorhandler_writer(
7335 errors, &error_handler_obj,
7336 "ascii", "ordinal not in range(128)",
7337 &starts, &e, &startinpos, &endinpos, &exc, &s,
7338 &writer))
7339 goto onError;
7340 kind = writer.kind;
7341 data = writer.data;
7342 }
7343 }
7344 Py_XDECREF(error_handler_obj);
7345 Py_XDECREF(exc);
7346 return _PyUnicodeWriter_Finish(&writer);
7347
7348 onError:
7349 _PyUnicodeWriter_Dealloc(&writer);
7350 Py_XDECREF(error_handler_obj);
7351 Py_XDECREF(exc);
7352 return NULL;
7353 }
7354
7355 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7356 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7357 {
7358 if (!PyUnicode_Check(unicode)) {
7359 PyErr_BadArgument();
7360 return NULL;
7361 }
7362 if (PyUnicode_READY(unicode) == -1)
7363 return NULL;
7364 /* Fast path: if it is an ASCII-only string, construct bytes object
7365 directly. Else defer to above function to raise the exception. */
7366 if (PyUnicode_IS_ASCII(unicode))
7367 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7368 PyUnicode_GET_LENGTH(unicode));
7369 return unicode_encode_ucs1(unicode, errors, 128);
7370 }
7371
7372 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7373 PyUnicode_AsASCIIString(PyObject *unicode)
7374 {
7375 return _PyUnicode_AsASCIIString(unicode, NULL);
7376 }
7377
7378 #ifdef MS_WINDOWS
7379
7380 /* --- MBCS codecs for Windows -------------------------------------------- */
7381
7382 #if SIZEOF_INT < SIZEOF_SIZE_T
7383 #define NEED_RETRY
7384 #endif
7385
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit of MultiByteToWideChar on Windows. */
7390 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7391
7392 #ifndef WC_ERR_INVALID_CHARS
7393 # define WC_ERR_INVALID_CHARS 0x0080
7394 #endif
7395
7396 static const char*
code_page_name(UINT code_page,PyObject ** obj)7397 code_page_name(UINT code_page, PyObject **obj)
7398 {
7399 *obj = NULL;
7400 if (code_page == CP_ACP)
7401 return "mbcs";
7402 if (code_page == CP_UTF7)
7403 return "CP_UTF7";
7404 if (code_page == CP_UTF8)
7405 return "CP_UTF8";
7406
7407 *obj = PyBytes_FromFormat("cp%u", code_page);
7408 if (*obj == NULL)
7409 return NULL;
7410 return PyBytes_AS_STRING(*obj);
7411 }
7412
7413 static DWORD
decode_code_page_flags(UINT code_page)7414 decode_code_page_flags(UINT code_page)
7415 {
7416 if (code_page == CP_UTF7) {
7417 /* The CP_UTF7 decoder only supports flags=0 */
7418 return 0;
7419 }
7420 else
7421 return MB_ERR_INVALID_CHARS;
7422 }
7423
7424 /*
7425 * Decode a byte string from a Windows code page into unicode object in strict
7426 * mode.
7427 *
7428 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7429 * OSError and returns -1 on other error.
7430 */
7431 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7432 decode_code_page_strict(UINT code_page,
7433 wchar_t **buf,
7434 Py_ssize_t *bufsize,
7435 const char *in,
7436 int insize)
7437 {
7438 DWORD flags = MB_ERR_INVALID_CHARS;
7439 wchar_t *out;
7440 DWORD outsize;
7441
7442 /* First get the size of the result */
7443 assert(insize > 0);
7444 while ((outsize = MultiByteToWideChar(code_page, flags,
7445 in, insize, NULL, 0)) <= 0)
7446 {
7447 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7448 goto error;
7449 }
7450 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7451 flags = 0;
7452 }
7453
7454 /* Extend a wchar_t* buffer */
7455 Py_ssize_t n = *bufsize; /* Get the current length */
7456 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7457 return -1;
7458 }
7459 out = *buf + n;
7460
7461 /* Do the conversion */
7462 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7463 if (outsize <= 0)
7464 goto error;
7465 return insize;
7466
7467 error:
7468 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7469 return -2;
7470 PyErr_SetFromWindowsErr(0);
7471 return -1;
7472 }
7473
/*
 * Decode `size` bytes at `in` from a code page, one character at a time,
 * applying the error handler `errors` to undecodable bytes.  The result is
 * appended to the wchar_t buffer *buf (current length *bufsize), and
 * *bufsize is updated to the new length.
 *
 * If `errors` is NULL or "strict" and `final` is true, no decoding is
 * attempted: a UnicodeDecodeError is raised immediately.  If `final` is
 * false, an undecodable sequence at the very end of the input is left
 * unconsumed instead of being reported.
 *
 * Returns the number of input bytes consumed on success, or raises an
 * OSError or UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    /* stays -1 on every path that jumps to `error` */
    int ret = -1;

    assert(size > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Reserve room for up to Py_ARRAY_LENGTH(buffer) wchar_t per input
       byte, checking first that the new size cannot overflow. */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 byte first and widen the window by one
           byte each time the conversion fails. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a single undecodable byte to the error handler. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            /* Recompute `out` from *buf and the updated outpos. */
            out = *buf + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

error:
    /* Shared exit for success and failure: release temporaries. */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7603
7604 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7605 decode_code_page_stateful(int code_page,
7606 const char *s, Py_ssize_t size,
7607 const char *errors, Py_ssize_t *consumed)
7608 {
7609 wchar_t *buf = NULL;
7610 Py_ssize_t bufsize = 0;
7611 int chunk_size, final, converted, done;
7612
7613 if (code_page < 0) {
7614 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7615 return NULL;
7616 }
7617 if (size < 0) {
7618 PyErr_BadInternalCall();
7619 return NULL;
7620 }
7621
7622 if (consumed)
7623 *consumed = 0;
7624
7625 do
7626 {
7627 #ifdef NEED_RETRY
7628 if (size > DECODING_CHUNK_SIZE) {
7629 chunk_size = DECODING_CHUNK_SIZE;
7630 final = 0;
7631 done = 0;
7632 }
7633 else
7634 #endif
7635 {
7636 chunk_size = (int)size;
7637 final = (consumed == NULL);
7638 done = 1;
7639 }
7640
7641 if (chunk_size == 0 && done) {
7642 if (buf != NULL)
7643 break;
7644 _Py_RETURN_UNICODE_EMPTY();
7645 }
7646
7647 converted = decode_code_page_strict(code_page, &buf, &bufsize,
7648 s, chunk_size);
7649 if (converted == -2)
7650 converted = decode_code_page_errors(code_page, &buf, &bufsize,
7651 s, chunk_size,
7652 errors, final);
7653 assert(converted != 0 || done);
7654
7655 if (converted < 0) {
7656 PyMem_Free(buf);
7657 return NULL;
7658 }
7659
7660 if (consumed)
7661 *consumed += converted;
7662
7663 s += converted;
7664 size -= converted;
7665 } while (!done);
7666
7667 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7668 PyMem_Free(buf);
7669 return v;
7670 }
7671
7672 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7673 PyUnicode_DecodeCodePageStateful(int code_page,
7674 const char *s,
7675 Py_ssize_t size,
7676 const char *errors,
7677 Py_ssize_t *consumed)
7678 {
7679 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7680 }
7681
7682 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7683 PyUnicode_DecodeMBCSStateful(const char *s,
7684 Py_ssize_t size,
7685 const char *errors,
7686 Py_ssize_t *consumed)
7687 {
7688 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7689 }
7690
7691 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7692 PyUnicode_DecodeMBCS(const char *s,
7693 Py_ssize_t size,
7694 const char *errors)
7695 {
7696 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7697 }
7698
7699 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7700 encode_code_page_flags(UINT code_page, const char *errors)
7701 {
7702 if (code_page == CP_UTF8) {
7703 return WC_ERR_INVALID_CHARS;
7704 }
7705 else if (code_page == CP_UTF7) {
7706 /* CP_UTF7 only supports flags=0 */
7707 return 0;
7708 }
7709 else {
7710 if (errors != NULL && strcmp(errors, "replace") == 0)
7711 return 0;
7712 else
7713 return WC_NO_BEST_FIT_CHARS;
7714 }
7715 }
7716
7717 /*
7718 * Encode a Unicode string to a Windows code page into a byte string in strict
7719 * mode.
7720 *
7721 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7722 * an OSError and returns -1 on other error.
7723 */
7724 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7725 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7726 PyObject *unicode, Py_ssize_t offset, int len,
7727 const char* errors)
7728 {
7729 BOOL usedDefaultChar = FALSE;
7730 BOOL *pusedDefaultChar = &usedDefaultChar;
7731 int outsize;
7732 wchar_t *p;
7733 Py_ssize_t size;
7734 const DWORD flags = encode_code_page_flags(code_page, NULL);
7735 char *out;
7736 /* Create a substring so that we can get the UTF-16 representation
7737 of just the slice under consideration. */
7738 PyObject *substring;
7739 int ret = -1;
7740
7741 assert(len > 0);
7742
7743 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7744 pusedDefaultChar = &usedDefaultChar;
7745 else
7746 pusedDefaultChar = NULL;
7747
7748 substring = PyUnicode_Substring(unicode, offset, offset+len);
7749 if (substring == NULL)
7750 return -1;
7751 #if USE_UNICODE_WCHAR_CACHE
7752 _Py_COMP_DIAG_PUSH
7753 _Py_COMP_DIAG_IGNORE_DEPR_DECLS
7754 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7755 if (p == NULL) {
7756 Py_DECREF(substring);
7757 return -1;
7758 }
7759 _Py_COMP_DIAG_POP
7760 #else /* USE_UNICODE_WCHAR_CACHE */
7761 p = PyUnicode_AsWideCharString(substring, &size);
7762 Py_CLEAR(substring);
7763 if (p == NULL) {
7764 return -1;
7765 }
7766 #endif /* USE_UNICODE_WCHAR_CACHE */
7767 assert(size <= INT_MAX);
7768
7769 /* First get the size of the result */
7770 outsize = WideCharToMultiByte(code_page, flags,
7771 p, (int)size,
7772 NULL, 0,
7773 NULL, pusedDefaultChar);
7774 if (outsize <= 0)
7775 goto error;
7776 /* If we used a default char, then we failed! */
7777 if (pusedDefaultChar && *pusedDefaultChar) {
7778 ret = -2;
7779 goto done;
7780 }
7781
7782 if (*outbytes == NULL) {
7783 /* Create string object */
7784 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7785 if (*outbytes == NULL) {
7786 goto done;
7787 }
7788 out = PyBytes_AS_STRING(*outbytes);
7789 }
7790 else {
7791 /* Extend string object */
7792 const Py_ssize_t n = PyBytes_Size(*outbytes);
7793 if (outsize > PY_SSIZE_T_MAX - n) {
7794 PyErr_NoMemory();
7795 goto done;
7796 }
7797 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7798 goto done;
7799 }
7800 out = PyBytes_AS_STRING(*outbytes) + n;
7801 }
7802
7803 /* Do the conversion */
7804 outsize = WideCharToMultiByte(code_page, flags,
7805 p, (int)size,
7806 out, outsize,
7807 NULL, pusedDefaultChar);
7808 if (outsize <= 0)
7809 goto error;
7810 if (pusedDefaultChar && *pusedDefaultChar) {
7811 ret = -2;
7812 goto done;
7813 }
7814 ret = 0;
7815
7816 done:
7817 #if USE_UNICODE_WCHAR_CACHE
7818 Py_DECREF(substring);
7819 #else /* USE_UNICODE_WCHAR_CACHE */
7820 PyMem_Free(p);
7821 #endif /* USE_UNICODE_WCHAR_CACHE */
7822 return ret;
7823
7824 error:
7825 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7826 ret = -2;
7827 goto done;
7828 }
7829 PyErr_SetFromWindowsErr(0);
7830 goto done;
7831 }
7832
/*
 * Encode the slice unicode[unicode_offset:unicode_offset+insize] to a Windows
 * code page into a byte string using an error handler.
 *
 * The slice is converted one character at a time with WideCharToMultiByte();
 * the `errors` handler is called for each character that cannot be
 * converted.  The output is appended to *outbytes, which is created here
 * when it is NULL.
 *
 * Returns 0 on success and -1 on error.  With errors NULL or "strict" a
 * UnicodeEncodeError is raised through PyCodec_StrictErrors(); an OSError is
 * raised when WideCharToMultiByte() fails for a reason other than
 * ERROR_NO_UNICODE_TRANSLATION.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No default-character reporting is requested for CP_UTF8/CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) bytes per input character, checking the
       multiplication for overflow first. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Code points of 0x10000 and above are passed as a surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Converted without substituting a default character: copy the
               bytes and move on.  Otherwise fall through to the handler. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character at pos cannot be encoded: ask the error handler. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        /* morebytes = replacement length - (newpos - pos); when positive the
           output object is grown by that many bytes before copying. */
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                /* Resizing may move the object: remember the write offset
                   and recompute `out` afterwards. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* The replacement is not bytes; it is read as a str and must
               contain only ASCII characters. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            const void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + morebytes;
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        /* Resume at the position chosen by the error handler. */
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the over-allocated output to the bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    /* Shared exit for success and failure; ret tells them apart. */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
8023
8024 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)8025 encode_code_page(int code_page,
8026 PyObject *unicode,
8027 const char *errors)
8028 {
8029 Py_ssize_t len;
8030 PyObject *outbytes = NULL;
8031 Py_ssize_t offset;
8032 int chunk_len, ret, done;
8033
8034 if (!PyUnicode_Check(unicode)) {
8035 PyErr_BadArgument();
8036 return NULL;
8037 }
8038
8039 if (PyUnicode_READY(unicode) == -1)
8040 return NULL;
8041 len = PyUnicode_GET_LENGTH(unicode);
8042
8043 if (code_page < 0) {
8044 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8045 return NULL;
8046 }
8047
8048 if (len == 0)
8049 return PyBytes_FromStringAndSize(NULL, 0);
8050
8051 offset = 0;
8052 do
8053 {
8054 #ifdef NEED_RETRY
8055 if (len > DECODING_CHUNK_SIZE) {
8056 chunk_len = DECODING_CHUNK_SIZE;
8057 done = 0;
8058 }
8059 else
8060 #endif
8061 {
8062 chunk_len = (int)len;
8063 done = 1;
8064 }
8065
8066 ret = encode_code_page_strict(code_page, &outbytes,
8067 unicode, offset, chunk_len,
8068 errors);
8069 if (ret == -2)
8070 ret = encode_code_page_errors(code_page, &outbytes,
8071 unicode, offset,
8072 chunk_len, errors);
8073 if (ret < 0) {
8074 Py_XDECREF(outbytes);
8075 return NULL;
8076 }
8077
8078 offset += chunk_len;
8079 len -= chunk_len;
8080 } while (!done);
8081
8082 return outbytes;
8083 }
8084
8085 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)8086 PyUnicode_EncodeCodePage(int code_page,
8087 PyObject *unicode,
8088 const char *errors)
8089 {
8090 return encode_code_page(code_page, unicode, errors);
8091 }
8092
8093 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)8094 PyUnicode_AsMBCSString(PyObject *unicode)
8095 {
8096 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8097 }
8098
8099 #undef NEED_RETRY
8100
8101 #endif /* MS_WINDOWS */
8102
8103 /* --- Character Mapping Codec -------------------------------------------- */
8104
/* Decode the bytes s[0:size] through `mapping`, a str used as a lookup table
   indexed by byte value, and append the result to `writer`.

   A table entry equal to U+FFFE, or a byte value that is not below the
   table length, is an undefined mapping and is handed to the `errors`
   handler.

   Returns 0 on success, -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* x exceeds the writer's current maxchar: prepare the
                   writer for one character with maxchar 0xff, then reload
                   the cached maxchar and data pointer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL on this path. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Fast loops for a full 2-byte table; they are left through
               `goto Error` as soon as a character needs the generic code
               below. */
            enum PyUnicode_Kind outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* x does not fit the writer's current maxchar */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* undefined mapping */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: one table lookup per iteration. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also entered from the fast loops above, with s and x describing
           the character that stopped them. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8223
8224 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)8225 charmap_decode_mapping(const char *s,
8226 Py_ssize_t size,
8227 PyObject *mapping,
8228 const char *errors,
8229 _PyUnicodeWriter *writer)
8230 {
8231 const char *starts = s;
8232 const char *e;
8233 Py_ssize_t startinpos, endinpos;
8234 PyObject *errorHandler = NULL, *exc = NULL;
8235 unsigned char ch;
8236 PyObject *key, *item = NULL;
8237
8238 e = s + size;
8239
8240 while (s < e) {
8241 ch = *s;
8242
8243 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8244 key = PyLong_FromLong((long)ch);
8245 if (key == NULL)
8246 goto onError;
8247
8248 item = PyObject_GetItem(mapping, key);
8249 Py_DECREF(key);
8250 if (item == NULL) {
8251 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8252 /* No mapping found means: mapping is undefined. */
8253 PyErr_Clear();
8254 goto Undefined;
8255 } else
8256 goto onError;
8257 }
8258
8259 /* Apply mapping */
8260 if (item == Py_None)
8261 goto Undefined;
8262 if (PyLong_Check(item)) {
8263 long value = PyLong_AS_LONG(item);
8264 if (value == 0xFFFE)
8265 goto Undefined;
8266 if (value < 0 || value > MAX_UNICODE) {
8267 PyErr_Format(PyExc_TypeError,
8268 "character mapping must be in range(0x%x)",
8269 (unsigned long)MAX_UNICODE + 1);
8270 goto onError;
8271 }
8272
8273 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8274 goto onError;
8275 }
8276 else if (PyUnicode_Check(item)) {
8277 if (PyUnicode_READY(item) == -1)
8278 goto onError;
8279 if (PyUnicode_GET_LENGTH(item) == 1) {
8280 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8281 if (value == 0xFFFE)
8282 goto Undefined;
8283 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8284 goto onError;
8285 }
8286 else {
8287 writer->overallocate = 1;
8288 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8289 goto onError;
8290 }
8291 }
8292 else {
8293 /* wrong return value */
8294 PyErr_SetString(PyExc_TypeError,
8295 "character mapping must return integer, None or str");
8296 goto onError;
8297 }
8298 Py_CLEAR(item);
8299 ++s;
8300 continue;
8301
8302 Undefined:
8303 /* undefined mapping */
8304 Py_CLEAR(item);
8305 startinpos = s-starts;
8306 endinpos = startinpos+1;
8307 if (unicode_decode_call_errorhandler_writer(
8308 errors, &errorHandler,
8309 "charmap", "character maps to <undefined>",
8310 &starts, &e, &startinpos, &endinpos, &exc, &s,
8311 writer)) {
8312 goto onError;
8313 }
8314 }
8315 Py_XDECREF(errorHandler);
8316 Py_XDECREF(exc);
8317 return 0;
8318
8319 onError:
8320 Py_XDECREF(item);
8321 Py_XDECREF(errorHandler);
8322 Py_XDECREF(exc);
8323 return -1;
8324 }
8325
8326 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8327 PyUnicode_DecodeCharmap(const char *s,
8328 Py_ssize_t size,
8329 PyObject *mapping,
8330 const char *errors)
8331 {
8332 _PyUnicodeWriter writer;
8333
8334 /* Default to Latin-1 */
8335 if (mapping == NULL)
8336 return PyUnicode_DecodeLatin1(s, size, errors);
8337
8338 if (size == 0)
8339 _Py_RETURN_UNICODE_EMPTY();
8340 _PyUnicodeWriter_Init(&writer);
8341 writer.min_length = size;
8342 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8343 goto onError;
8344
8345 if (PyUnicode_CheckExact(mapping)) {
8346 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8347 goto onError;
8348 }
8349 else {
8350 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8351 goto onError;
8352 }
8353 return _PyUnicodeWriter_Finish(&writer);
8354
8355 onError:
8356 _PyUnicodeWriter_Dealloc(&writer);
8357 return NULL;
8358 }
8359
8360 /* Charmap encoding: the lookup table */
8361
8362 /*[clinic input]
8363 class EncodingMap "struct encoding_map *" "&EncodingMapType"
8364 [clinic start generated code]*/
8365 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8366
/* Three-level lookup table from code points up to 0xFFFF to byte values.
   It is filled in by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by c >> 11: number of the level-2 block, 0xFF if there is
       none. */
    unsigned char level1[32];
    /* Number of level-2 blocks (16 bytes each) and of level-3 blocks
       (128 bytes each) stored in level23. */
    int count2, count3;
    /* Start of the variable-length tail: 16*count2 level-2 bytes followed by
       128*count3 level-3 bytes.  Only one element is declared; the builder
       allocates the extra space. */
    unsigned char level23[1];
};
8373
8374 /*[clinic input]
8375 EncodingMap.size
8376
8377 Return the size (in bytes) of this object.
8378 [clinic start generated code]*/
8379
8380 static PyObject *
EncodingMap_size_impl(struct encoding_map * self)8381 EncodingMap_size_impl(struct encoding_map *self)
8382 /*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8383 {
8384 return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8385 128*self->count3);
8386 }
8387
/* Method table referenced by EncodingMapType.tp_methods.
   ENCODINGMAP_SIZE_METHODDEF is defined outside this section; presumably it
   is the Argument Clinic entry for EncodingMap.size() -- confirm in the
   generated clinic header. */
static PyMethodDef encoding_map_methods[] = {
    ENCODINGMAP_SIZE_METHODDEF
    {NULL, NULL}  /* sentinel */
};
8392
/* Type object of the trie returned by PyUnicode_BuildEncodingMap().
   Py_IS_TYPE(mapping, &EncodingMapType) is how the charmap encoder
   recognises such a mapping. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "EncodingMap",
    /* Size of the fixed part only; the trie data is allocated in addition
       by PyUnicode_BuildEncodingMap(). */
    .tp_basicsize = sizeof(struct encoding_map),
    /* methods */
    .tp_flags = Py_TPFLAGS_DEFAULT,
    .tp_methods = encoding_map_methods,
};
8401
8402 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8403 PyUnicode_BuildEncodingMap(PyObject* string)
8404 {
8405 PyObject *result;
8406 struct encoding_map *mresult;
8407 int i;
8408 int need_dict = 0;
8409 unsigned char level1[32];
8410 unsigned char level2[512];
8411 unsigned char *mlevel1, *mlevel2, *mlevel3;
8412 int count2 = 0, count3 = 0;
8413 int kind;
8414 const void *data;
8415 Py_ssize_t length;
8416 Py_UCS4 ch;
8417
8418 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8419 PyErr_BadArgument();
8420 return NULL;
8421 }
8422 kind = PyUnicode_KIND(string);
8423 data = PyUnicode_DATA(string);
8424 length = PyUnicode_GET_LENGTH(string);
8425 length = Py_MIN(length, 256);
8426 memset(level1, 0xFF, sizeof level1);
8427 memset(level2, 0xFF, sizeof level2);
8428
8429 /* If there isn't a one-to-one mapping of NULL to \0,
8430 or if there are non-BMP characters, we need to use
8431 a mapping dictionary. */
8432 if (PyUnicode_READ(kind, data, 0) != 0)
8433 need_dict = 1;
8434 for (i = 1; i < length; i++) {
8435 int l1, l2;
8436 ch = PyUnicode_READ(kind, data, i);
8437 if (ch == 0 || ch > 0xFFFF) {
8438 need_dict = 1;
8439 break;
8440 }
8441 if (ch == 0xFFFE)
8442 /* unmapped character */
8443 continue;
8444 l1 = ch >> 11;
8445 l2 = ch >> 7;
8446 if (level1[l1] == 0xFF)
8447 level1[l1] = count2++;
8448 if (level2[l2] == 0xFF)
8449 level2[l2] = count3++;
8450 }
8451
8452 if (count2 >= 0xFF || count3 >= 0xFF)
8453 need_dict = 1;
8454
8455 if (need_dict) {
8456 PyObject *result = PyDict_New();
8457 PyObject *key, *value;
8458 if (!result)
8459 return NULL;
8460 for (i = 0; i < length; i++) {
8461 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8462 value = PyLong_FromLong(i);
8463 if (!key || !value)
8464 goto failed1;
8465 if (PyDict_SetItem(result, key, value) == -1)
8466 goto failed1;
8467 Py_DECREF(key);
8468 Py_DECREF(value);
8469 }
8470 return result;
8471 failed1:
8472 Py_XDECREF(key);
8473 Py_XDECREF(value);
8474 Py_DECREF(result);
8475 return NULL;
8476 }
8477
8478 /* Create a three-level trie */
8479 result = PyObject_Malloc(sizeof(struct encoding_map) +
8480 16*count2 + 128*count3 - 1);
8481 if (!result) {
8482 return PyErr_NoMemory();
8483 }
8484
8485 _PyObject_Init(result, &EncodingMapType);
8486 mresult = (struct encoding_map*)result;
8487 mresult->count2 = count2;
8488 mresult->count3 = count3;
8489 mlevel1 = mresult->level1;
8490 mlevel2 = mresult->level23;
8491 mlevel3 = mresult->level23 + 16*count2;
8492 memcpy(mlevel1, level1, 32);
8493 memset(mlevel2, 0xFF, 16*count2);
8494 memset(mlevel3, 0, 128*count3);
8495 count3 = 0;
8496 for (i = 1; i < length; i++) {
8497 int o1, o2, o3, i2, i3;
8498 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8499 if (ch == 0xFFFE)
8500 /* unmapped character */
8501 continue;
8502 o1 = ch>>11;
8503 o2 = (ch>>7) & 0xF;
8504 i2 = 16*mlevel1[o1] + o2;
8505 if (mlevel2[i2] == 0xFF)
8506 mlevel2[i2] = count3++;
8507 o3 = ch & 0x7F;
8508 i3 = 128*mlevel2[i2] + o3;
8509 mlevel3[i3] = i;
8510 }
8511 return result;
8512 }
8513
8514 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8515 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8516 {
8517 struct encoding_map *map = (struct encoding_map*)mapping;
8518 int l1 = c>>11;
8519 int l2 = (c>>7) & 0xF;
8520 int l3 = c & 0x7F;
8521 int i;
8522
8523 if (c > 0xFFFF)
8524 return -1;
8525 if (c == 0)
8526 return 0;
8527 /* level 1*/
8528 i = map->level1[l1];
8529 if (i == 0xFF) {
8530 return -1;
8531 }
8532 /* level 2*/
8533 i = map->level23[16*i+l2];
8534 if (i == 0xFF) {
8535 return -1;
8536 }
8537 /* level 3 */
8538 i = map->level23[16*map->count2 + 128*i + l3];
8539 if (i == 0) {
8540 return -1;
8541 }
8542 return i;
8543 }
8544
8545 /* Lookup the character ch in the mapping. If the character
8546 can't be found, Py_None is returned (or NULL, if another
8547 error occurred). */
8548 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8549 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8550 {
8551 PyObject *w = PyLong_FromLong((long)c);
8552 PyObject *x;
8553
8554 if (w == NULL)
8555 return NULL;
8556 x = PyObject_GetItem(mapping, w);
8557 Py_DECREF(w);
8558 if (x == NULL) {
8559 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8560 /* No mapping found means: mapping is undefined. */
8561 PyErr_Clear();
8562 Py_RETURN_NONE;
8563 } else
8564 return NULL;
8565 }
8566 else if (x == Py_None)
8567 return x;
8568 else if (PyLong_Check(x)) {
8569 long value = PyLong_AS_LONG(x);
8570 if (value < 0 || value > 255) {
8571 PyErr_SetString(PyExc_TypeError,
8572 "character mapping must be in range(256)");
8573 Py_DECREF(x);
8574 return NULL;
8575 }
8576 return x;
8577 }
8578 else if (PyBytes_Check(x))
8579 return x;
8580 else {
8581 /* wrong return value */
8582 PyErr_Format(PyExc_TypeError,
8583 "character mapping must return integer, bytes or None, not %.400s",
8584 Py_TYPE(x)->tp_name);
8585 Py_DECREF(x);
8586 return NULL;
8587 }
8588 }
8589
8590 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8591 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8592 {
8593 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8594 /* exponentially overallocate to minimize reallocations */
8595 if (requiredsize < 2*outsize)
8596 requiredsize = 2*outsize;
8597 if (_PyBytes_Resize(outobj, requiredsize))
8598 return -1;
8599 return 0;
8600 }
8601
/* Outcome of charmapencode_output():
     enc_SUCCESS    the character was encoded and written,
     enc_FAILED     the character has no mapping (nothing was written),
     enc_EXCEPTION  an error occurred (lookup or resize failure). */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8605 /* lookup the character, put the result in the output string and adjust
8606 various state variables. Resize the output bytes object if not enough
8607 space is available. Return a new reference to the object that
8608 was put in the output buffer, or Py_None, if the mapping was undefined
8609 (in which case no character was written) or NULL, if a
8610 reallocation error occurred. The caller must decref the result */
8611 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8612 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8613 PyObject **outobj, Py_ssize_t *outpos)
8614 {
8615 PyObject *rep;
8616 char *outstart;
8617 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8618
8619 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8620 int res = encoding_map_lookup(c, mapping);
8621 Py_ssize_t requiredsize = *outpos+1;
8622 if (res == -1)
8623 return enc_FAILED;
8624 if (outsize<requiredsize)
8625 if (charmapencode_resize(outobj, outpos, requiredsize))
8626 return enc_EXCEPTION;
8627 outstart = PyBytes_AS_STRING(*outobj);
8628 outstart[(*outpos)++] = (char)res;
8629 return enc_SUCCESS;
8630 }
8631
8632 rep = charmapencode_lookup(c, mapping);
8633 if (rep==NULL)
8634 return enc_EXCEPTION;
8635 else if (rep==Py_None) {
8636 Py_DECREF(rep);
8637 return enc_FAILED;
8638 } else {
8639 if (PyLong_Check(rep)) {
8640 Py_ssize_t requiredsize = *outpos+1;
8641 if (outsize<requiredsize)
8642 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8643 Py_DECREF(rep);
8644 return enc_EXCEPTION;
8645 }
8646 outstart = PyBytes_AS_STRING(*outobj);
8647 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8648 }
8649 else {
8650 const char *repchars = PyBytes_AS_STRING(rep);
8651 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8652 Py_ssize_t requiredsize = *outpos+repsize;
8653 if (outsize<requiredsize)
8654 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8655 Py_DECREF(rep);
8656 return enc_EXCEPTION;
8657 }
8658 outstart = PyBytes_AS_STRING(*outobj);
8659 memcpy(outstart + *outpos, repchars, repsize);
8660 *outpos += repsize;
8661 }
8662 }
8663 Py_DECREF(rep);
8664 return enc_SUCCESS;
8665 }
8666
/* handle an error in PyUnicode_EncodeCharmap

   *inpos is the index in `unicode` of a character that `mapping` cannot
   encode.  The run of consecutive unencodable characters starting there is
   collected and handled according to `errors`.  Replacement output is
   appended to *res at *respos (growing *res when needed) and *inpos is
   advanced past the handled input.

   *error_handler caches the handler kind decoded from `errors` between
   calls; error_handler_obj is passed on to
   unicode_encode_call_errorhandler().

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
            /* EncodingMap: direct trie lookup, no Python objects involved */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* this character can be encoded: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* write one '?' per unencodable character; '?' itself must be
           encodable through the mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: each unencodable character becomes the
           text "&#<decimal code point>;", which is itself encoded through
           the mapping one character at a time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* any other handler: call the error callback */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode), repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the callback returned a str, which is
           encoded through the mapping character by character */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* continue at the position chosen by the callback */
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8814
8815 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8816 _PyUnicode_EncodeCharmap(PyObject *unicode,
8817 PyObject *mapping,
8818 const char *errors)
8819 {
8820 /* output object */
8821 PyObject *res = NULL;
8822 /* current input position */
8823 Py_ssize_t inpos = 0;
8824 Py_ssize_t size;
8825 /* current output position */
8826 Py_ssize_t respos = 0;
8827 PyObject *error_handler_obj = NULL;
8828 PyObject *exc = NULL;
8829 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8830 const void *data;
8831 int kind;
8832
8833 if (PyUnicode_READY(unicode) == -1)
8834 return NULL;
8835 size = PyUnicode_GET_LENGTH(unicode);
8836 data = PyUnicode_DATA(unicode);
8837 kind = PyUnicode_KIND(unicode);
8838
8839 /* Default to Latin-1 */
8840 if (mapping == NULL)
8841 return unicode_encode_ucs1(unicode, errors, 256);
8842
8843 /* allocate enough for a simple encoding without
8844 replacements, if we need more, we'll resize */
8845 res = PyBytes_FromStringAndSize(NULL, size);
8846 if (res == NULL)
8847 goto onError;
8848 if (size == 0)
8849 return res;
8850
8851 while (inpos<size) {
8852 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8853 /* try to encode it */
8854 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8855 if (x==enc_EXCEPTION) /* error */
8856 goto onError;
8857 if (x==enc_FAILED) { /* unencodable character */
8858 if (charmap_encoding_error(unicode, &inpos, mapping,
8859 &exc,
8860 &error_handler, &error_handler_obj, errors,
8861 &res, &respos)) {
8862 goto onError;
8863 }
8864 }
8865 else
8866 /* done with this character => adjust input position */
8867 ++inpos;
8868 }
8869
8870 /* Resize if we allocated to much */
8871 if (respos<PyBytes_GET_SIZE(res))
8872 if (_PyBytes_Resize(&res, respos) < 0)
8873 goto onError;
8874
8875 Py_XDECREF(exc);
8876 Py_XDECREF(error_handler_obj);
8877 return res;
8878
8879 onError:
8880 Py_XDECREF(res);
8881 Py_XDECREF(exc);
8882 Py_XDECREF(error_handler_obj);
8883 return NULL;
8884 }
8885
8886 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8887 PyUnicode_AsCharmapString(PyObject *unicode,
8888 PyObject *mapping)
8889 {
8890 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8891 PyErr_BadArgument();
8892 return NULL;
8893 }
8894 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8895 }
8896
8897 /* create or adjust a UnicodeTranslateError */
8898 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8899 make_translate_exception(PyObject **exceptionObject,
8900 PyObject *unicode,
8901 Py_ssize_t startpos, Py_ssize_t endpos,
8902 const char *reason)
8903 {
8904 if (*exceptionObject == NULL) {
8905 *exceptionObject = _PyUnicodeTranslateError_Create(
8906 unicode, startpos, endpos, reason);
8907 }
8908 else {
8909 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8910 goto onError;
8911 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8912 goto onError;
8913 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8914 goto onError;
8915 return;
8916 onError:
8917 Py_CLEAR(*exceptionObject);
8918 }
8919 }
8920
8921 /* error handling callback helper:
8922 build arguments, call the callback and check the arguments,
8923 put the result into newpos and return the replacement string, which
8924 has to be freed by the caller */
8925 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8926 unicode_translate_call_errorhandler(const char *errors,
8927 PyObject **errorHandler,
8928 const char *reason,
8929 PyObject *unicode, PyObject **exceptionObject,
8930 Py_ssize_t startpos, Py_ssize_t endpos,
8931 Py_ssize_t *newpos)
8932 {
8933 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8934
8935 Py_ssize_t i_newpos;
8936 PyObject *restuple;
8937 PyObject *resunicode;
8938
8939 if (*errorHandler == NULL) {
8940 *errorHandler = PyCodec_LookupError(errors);
8941 if (*errorHandler == NULL)
8942 return NULL;
8943 }
8944
8945 make_translate_exception(exceptionObject,
8946 unicode, startpos, endpos, reason);
8947 if (*exceptionObject == NULL)
8948 return NULL;
8949
8950 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
8951 if (restuple == NULL)
8952 return NULL;
8953 if (!PyTuple_Check(restuple)) {
8954 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8955 Py_DECREF(restuple);
8956 return NULL;
8957 }
8958 if (!PyArg_ParseTuple(restuple, argparse,
8959 &resunicode, &i_newpos)) {
8960 Py_DECREF(restuple);
8961 return NULL;
8962 }
8963 if (i_newpos<0)
8964 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8965 else
8966 *newpos = i_newpos;
8967 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8968 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8969 Py_DECREF(restuple);
8970 return NULL;
8971 }
8972 Py_INCREF(resunicode);
8973 Py_DECREF(restuple);
8974 return resunicode;
8975 }
8976
8977 /* Lookup the character ch in the mapping and put the result in result,
8978 which must be decrefed by the caller.
8979 Return 0 on success, -1 on error */
8980 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8981 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8982 {
8983 PyObject *w = PyLong_FromLong((long)c);
8984 PyObject *x;
8985
8986 if (w == NULL)
8987 return -1;
8988 x = PyObject_GetItem(mapping, w);
8989 Py_DECREF(w);
8990 if (x == NULL) {
8991 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8992 /* No mapping found means: use 1:1 mapping. */
8993 PyErr_Clear();
8994 *result = NULL;
8995 return 0;
8996 } else
8997 return -1;
8998 }
8999 else if (x == Py_None) {
9000 *result = x;
9001 return 0;
9002 }
9003 else if (PyLong_Check(x)) {
9004 long value = PyLong_AS_LONG(x);
9005 if (value < 0 || value > MAX_UNICODE) {
9006 PyErr_Format(PyExc_ValueError,
9007 "character mapping must be in range(0x%x)",
9008 MAX_UNICODE+1);
9009 Py_DECREF(x);
9010 return -1;
9011 }
9012 *result = x;
9013 return 0;
9014 }
9015 else if (PyUnicode_Check(x)) {
9016 *result = x;
9017 return 0;
9018 }
9019 else {
9020 /* wrong return value */
9021 PyErr_SetString(PyExc_TypeError,
9022 "character mapping must return integer, None or str");
9023 Py_DECREF(x);
9024 return -1;
9025 }
9026 }
9027
9028 /* lookup the character, write the result into the writer.
9029 Return 1 if the result was written into the writer, return 0 if the mapping
9030 was undefined, raise an exception return -1 on error. */
9031 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)9032 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9033 _PyUnicodeWriter *writer)
9034 {
9035 PyObject *item;
9036
9037 if (charmaptranslate_lookup(ch, mapping, &item))
9038 return -1;
9039
9040 if (item == NULL) {
9041 /* not found => default to 1:1 mapping */
9042 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9043 return -1;
9044 }
9045 return 1;
9046 }
9047
9048 if (item == Py_None) {
9049 Py_DECREF(item);
9050 return 0;
9051 }
9052
9053 if (PyLong_Check(item)) {
9054 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9055 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9056 used it */
9057 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9058 Py_DECREF(item);
9059 return -1;
9060 }
9061 Py_DECREF(item);
9062 return 1;
9063 }
9064
9065 if (!PyUnicode_Check(item)) {
9066 Py_DECREF(item);
9067 return -1;
9068 }
9069
9070 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9071 Py_DECREF(item);
9072 return -1;
9073 }
9074
9075 Py_DECREF(item);
9076 return 1;
9077 }
9078
9079 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)9080 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9081 Py_UCS1 *translate)
9082 {
9083 PyObject *item = NULL;
9084 int ret = 0;
9085
9086 if (charmaptranslate_lookup(ch, mapping, &item)) {
9087 return -1;
9088 }
9089
9090 if (item == Py_None) {
9091 /* deletion */
9092 translate[ch] = 0xfe;
9093 }
9094 else if (item == NULL) {
9095 /* not found => default to 1:1 mapping */
9096 translate[ch] = ch;
9097 return 1;
9098 }
9099 else if (PyLong_Check(item)) {
9100 long replace = PyLong_AS_LONG(item);
9101 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9102 used it */
9103 if (127 < replace) {
9104 /* invalid character or character outside ASCII:
9105 skip the fast translate */
9106 goto exit;
9107 }
9108 translate[ch] = (Py_UCS1)replace;
9109 }
9110 else if (PyUnicode_Check(item)) {
9111 Py_UCS4 replace;
9112
9113 if (PyUnicode_READY(item) == -1) {
9114 Py_DECREF(item);
9115 return -1;
9116 }
9117 if (PyUnicode_GET_LENGTH(item) != 1)
9118 goto exit;
9119
9120 replace = PyUnicode_READ_CHAR(item, 0);
9121 if (replace > 127)
9122 goto exit;
9123 translate[ch] = (Py_UCS1)replace;
9124 }
9125 else {
9126 /* not None, NULL, long or unicode */
9127 goto exit;
9128 }
9129 ret = 1;
9130
9131 exit:
9132 Py_DECREF(item);
9133 return ret;
9134 }
9135
9136 /* Fast path for ascii => ascii translation. Return 1 if the whole string
9137 was translated into writer, return 0 if the input string was partially
9138 translated into writer, raise an exception and return -1 on error. */
9139 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)9140 unicode_fast_translate(PyObject *input, PyObject *mapping,
9141 _PyUnicodeWriter *writer, int ignore,
9142 Py_ssize_t *input_pos)
9143 {
9144 Py_UCS1 ascii_table[128], ch, ch2;
9145 Py_ssize_t len;
9146 const Py_UCS1 *in, *end;
9147 Py_UCS1 *out;
9148 int res = 0;
9149
9150 len = PyUnicode_GET_LENGTH(input);
9151
9152 memset(ascii_table, 0xff, 128);
9153
9154 in = PyUnicode_1BYTE_DATA(input);
9155 end = in + len;
9156
9157 assert(PyUnicode_IS_ASCII(writer->buffer));
9158 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9159 out = PyUnicode_1BYTE_DATA(writer->buffer);
9160
9161 for (; in < end; in++) {
9162 ch = *in;
9163 ch2 = ascii_table[ch];
9164 if (ch2 == 0xff) {
9165 int translate = unicode_fast_translate_lookup(mapping, ch,
9166 ascii_table);
9167 if (translate < 0)
9168 return -1;
9169 if (translate == 0)
9170 goto exit;
9171 ch2 = ascii_table[ch];
9172 }
9173 if (ch2 == 0xfe) {
9174 if (ignore)
9175 continue;
9176 goto exit;
9177 }
9178 assert(ch2 < 128);
9179 *out = ch2;
9180 out++;
9181 }
9182 res = 1;
9183
9184 exit:
9185 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9186 *input_pos = in - PyUnicode_1BYTE_DATA(input);
9187 return res;
9188 }
9189
9190 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)9191 _PyUnicode_TranslateCharmap(PyObject *input,
9192 PyObject *mapping,
9193 const char *errors)
9194 {
9195 /* input object */
9196 const void *data;
9197 Py_ssize_t size, i;
9198 int kind;
9199 /* output buffer */
9200 _PyUnicodeWriter writer;
9201 /* error handler */
9202 const char *reason = "character maps to <undefined>";
9203 PyObject *errorHandler = NULL;
9204 PyObject *exc = NULL;
9205 int ignore;
9206 int res;
9207
9208 if (mapping == NULL) {
9209 PyErr_BadArgument();
9210 return NULL;
9211 }
9212
9213 if (PyUnicode_READY(input) == -1)
9214 return NULL;
9215 data = PyUnicode_DATA(input);
9216 kind = PyUnicode_KIND(input);
9217 size = PyUnicode_GET_LENGTH(input);
9218
9219 if (size == 0)
9220 return PyUnicode_FromObject(input);
9221
9222 /* allocate enough for a simple 1:1 translation without
9223 replacements, if we need more, we'll resize */
9224 _PyUnicodeWriter_Init(&writer);
9225 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9226 goto onError;
9227
9228 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9229
9230 if (PyUnicode_READY(input) == -1)
9231 return NULL;
9232 if (PyUnicode_IS_ASCII(input)) {
9233 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9234 if (res < 0) {
9235 _PyUnicodeWriter_Dealloc(&writer);
9236 return NULL;
9237 }
9238 if (res == 1)
9239 return _PyUnicodeWriter_Finish(&writer);
9240 }
9241 else {
9242 i = 0;
9243 }
9244
9245 while (i<size) {
9246 /* try to encode it */
9247 int translate;
9248 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9249 Py_ssize_t newpos;
9250 /* startpos for collecting untranslatable chars */
9251 Py_ssize_t collstart;
9252 Py_ssize_t collend;
9253 Py_UCS4 ch;
9254
9255 ch = PyUnicode_READ(kind, data, i);
9256 translate = charmaptranslate_output(ch, mapping, &writer);
9257 if (translate < 0)
9258 goto onError;
9259
9260 if (translate != 0) {
9261 /* it worked => adjust input pointer */
9262 ++i;
9263 continue;
9264 }
9265
9266 /* untranslatable character */
9267 collstart = i;
9268 collend = i+1;
9269
9270 /* find all untranslatable characters */
9271 while (collend < size) {
9272 PyObject *x;
9273 ch = PyUnicode_READ(kind, data, collend);
9274 if (charmaptranslate_lookup(ch, mapping, &x))
9275 goto onError;
9276 Py_XDECREF(x);
9277 if (x != Py_None)
9278 break;
9279 ++collend;
9280 }
9281
9282 if (ignore) {
9283 i = collend;
9284 }
9285 else {
9286 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9287 reason, input, &exc,
9288 collstart, collend, &newpos);
9289 if (repunicode == NULL)
9290 goto onError;
9291 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9292 Py_DECREF(repunicode);
9293 goto onError;
9294 }
9295 Py_DECREF(repunicode);
9296 i = newpos;
9297 }
9298 }
9299 Py_XDECREF(exc);
9300 Py_XDECREF(errorHandler);
9301 return _PyUnicodeWriter_Finish(&writer);
9302
9303 onError:
9304 _PyUnicodeWriter_Dealloc(&writer);
9305 Py_XDECREF(exc);
9306 Py_XDECREF(errorHandler);
9307 return NULL;
9308 }
9309
9310 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9311 PyUnicode_Translate(PyObject *str,
9312 PyObject *mapping,
9313 const char *errors)
9314 {
9315 if (ensure_unicode(str) < 0)
9316 return NULL;
9317 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9318 }
9319
9320 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9321 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9322 {
9323 if (!PyUnicode_Check(unicode)) {
9324 PyErr_BadInternalCall();
9325 return NULL;
9326 }
9327 if (PyUnicode_READY(unicode) == -1)
9328 return NULL;
9329 if (PyUnicode_IS_ASCII(unicode)) {
9330 /* If the string is already ASCII, just return the same string */
9331 Py_INCREF(unicode);
9332 return unicode;
9333 }
9334
9335 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9336 PyObject *result = PyUnicode_New(len, 127);
9337 if (result == NULL) {
9338 return NULL;
9339 }
9340
9341 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9342 int kind = PyUnicode_KIND(unicode);
9343 const void *data = PyUnicode_DATA(unicode);
9344 Py_ssize_t i;
9345 for (i = 0; i < len; ++i) {
9346 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9347 if (ch < 127) {
9348 out[i] = ch;
9349 }
9350 else if (Py_UNICODE_ISSPACE(ch)) {
9351 out[i] = ' ';
9352 }
9353 else {
9354 int decimal = Py_UNICODE_TODECIMAL(ch);
9355 if (decimal < 0) {
9356 out[i] = '?';
9357 out[i+1] = '\0';
9358 _PyUnicode_LENGTH(result) = i + 1;
9359 break;
9360 }
9361 out[i] = '0' + decimal;
9362 }
9363 }
9364
9365 assert(_PyUnicode_CheckConsistency(result, 1));
9366 return result;
9367 }
9368
9369 /* --- Helpers ------------------------------------------------------------ */
9370
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` when larger; a negative `end` or `start` is
   taken relative to `len` and then clamped to 0.  `start` is not clamped
   from above.  `start` and `end` must be modifiable lvalues; every argument
   may be evaluated more than once, so none of them may have side effects.

   Wrapped in do { } while (0) so that the macro behaves as a single
   statement (safe in an unbraced if/else); invoke it with a trailing
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9385
9386 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9387 any_find_slice(PyObject* s1, PyObject* s2,
9388 Py_ssize_t start,
9389 Py_ssize_t end,
9390 int direction)
9391 {
9392 int kind1, kind2;
9393 const void *buf1, *buf2;
9394 Py_ssize_t len1, len2, result;
9395
9396 kind1 = PyUnicode_KIND(s1);
9397 kind2 = PyUnicode_KIND(s2);
9398 if (kind1 < kind2)
9399 return -1;
9400
9401 len1 = PyUnicode_GET_LENGTH(s1);
9402 len2 = PyUnicode_GET_LENGTH(s2);
9403 ADJUST_INDICES(start, end, len1);
9404 if (end - start < len2)
9405 return -1;
9406
9407 buf1 = PyUnicode_DATA(s1);
9408 buf2 = PyUnicode_DATA(s2);
9409 if (len2 == 1) {
9410 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9411 result = findchar((const char *)buf1 + kind1*start,
9412 kind1, end - start, ch, direction);
9413 if (result == -1)
9414 return -1;
9415 else
9416 return start + result;
9417 }
9418
9419 if (kind2 != kind1) {
9420 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9421 if (!buf2)
9422 return -2;
9423 }
9424
9425 if (direction > 0) {
9426 switch (kind1) {
9427 case PyUnicode_1BYTE_KIND:
9428 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9429 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9430 else
9431 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9432 break;
9433 case PyUnicode_2BYTE_KIND:
9434 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9435 break;
9436 case PyUnicode_4BYTE_KIND:
9437 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9438 break;
9439 default:
9440 Py_UNREACHABLE();
9441 }
9442 }
9443 else {
9444 switch (kind1) {
9445 case PyUnicode_1BYTE_KIND:
9446 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9447 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9448 else
9449 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9450 break;
9451 case PyUnicode_2BYTE_KIND:
9452 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9453 break;
9454 case PyUnicode_4BYTE_KIND:
9455 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9456 break;
9457 default:
9458 Py_UNREACHABLE();
9459 }
9460 }
9461
9462 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9463 if (kind2 != kind1)
9464 PyMem_Free((void *)buf2);
9465
9466 return result;
9467 }
9468
9469 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9470 #include "stringlib/localeutil.h"
9471
9472 /**
9473 * InsertThousandsGrouping:
9474 * @writer: Unicode writer.
9475 * @n_buffer: Number of characters in @buffer.
9476 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9477 * @d_pos: Start of digits string.
9478 * @n_digits: The number of digits in the string, in which we want
9479 * to put the grouping chars.
9480 * @min_width: The minimum width of the digits in the output string.
9481 * Output will be zero-padded on the left to fill.
9482 * @grouping: see definition in localeconv().
9483 * @thousands_sep: see definition in localeconv().
9484 *
9485 * There are 2 modes: counting and filling. If @writer is NULL,
9486 * we are in counting mode, else filling mode.
9487 * If counting, the required buffer size is returned.
9488 * If filling, we know the buffer will be large enough, so we don't
9489 * need to pass in the buffer size.
9490 * Inserts thousand grouping characters (as defined by grouping and
9491 * thousands_sep) into @writer.
9492 *
9493 * Return value: -1 on error, number of characters otherwise.
9494 **/
9495 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9496 _PyUnicode_InsertThousandsGrouping(
9497 _PyUnicodeWriter *writer,
9498 Py_ssize_t n_buffer,
9499 PyObject *digits,
9500 Py_ssize_t d_pos,
9501 Py_ssize_t n_digits,
9502 Py_ssize_t min_width,
9503 const char *grouping,
9504 PyObject *thousands_sep,
9505 Py_UCS4 *maxchar)
9506 {
9507 min_width = Py_MAX(0, min_width);
9508 if (writer) {
9509 assert(digits != NULL);
9510 assert(maxchar == NULL);
9511 }
9512 else {
9513 assert(digits == NULL);
9514 assert(maxchar != NULL);
9515 }
9516 assert(0 <= d_pos);
9517 assert(0 <= n_digits);
9518 assert(grouping != NULL);
9519
9520 if (digits != NULL) {
9521 if (PyUnicode_READY(digits) == -1) {
9522 return -1;
9523 }
9524 }
9525 if (PyUnicode_READY(thousands_sep) == -1) {
9526 return -1;
9527 }
9528
9529 Py_ssize_t count = 0;
9530 Py_ssize_t n_zeros;
9531 int loop_broken = 0;
9532 int use_separator = 0; /* First time through, don't append the
9533 separator. They only go between
9534 groups. */
9535 Py_ssize_t buffer_pos;
9536 Py_ssize_t digits_pos;
9537 Py_ssize_t len;
9538 Py_ssize_t n_chars;
9539 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9540 be looked at */
9541 /* A generator that returns all of the grouping widths, until it
9542 returns 0. */
9543 GroupGenerator groupgen;
9544 GroupGenerator_init(&groupgen, grouping);
9545 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9546
9547 /* if digits are not grouped, thousands separator
9548 should be an empty string */
9549 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9550
9551 digits_pos = d_pos + n_digits;
9552 if (writer) {
9553 buffer_pos = writer->pos + n_buffer;
9554 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9555 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9556 }
9557 else {
9558 buffer_pos = n_buffer;
9559 }
9560
9561 if (!writer) {
9562 *maxchar = 127;
9563 }
9564
9565 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9566 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9567 n_zeros = Py_MAX(0, len - remaining);
9568 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9569
9570 /* Use n_zero zero's and n_chars chars */
9571
9572 /* Count only, don't do anything. */
9573 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9574
9575 /* Copy into the writer. */
9576 InsertThousandsGrouping_fill(writer, &buffer_pos,
9577 digits, &digits_pos,
9578 n_chars, n_zeros,
9579 use_separator ? thousands_sep : NULL,
9580 thousands_sep_len, maxchar);
9581
9582 /* Use a separator next time. */
9583 use_separator = 1;
9584
9585 remaining -= n_chars;
9586 min_width -= len;
9587
9588 if (remaining <= 0 && min_width <= 0) {
9589 loop_broken = 1;
9590 break;
9591 }
9592 min_width -= thousands_sep_len;
9593 }
9594 if (!loop_broken) {
9595 /* We left the loop without using a break statement. */
9596
9597 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9598 n_zeros = Py_MAX(0, len - remaining);
9599 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9600
9601 /* Use n_zero zero's and n_chars chars */
9602 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9603
9604 /* Copy into the writer. */
9605 InsertThousandsGrouping_fill(writer, &buffer_pos,
9606 digits, &digits_pos,
9607 n_chars, n_zeros,
9608 use_separator ? thousands_sep : NULL,
9609 thousands_sep_len, maxchar);
9610 }
9611 return count;
9612 }
9613
9614
9615 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9616 PyUnicode_Count(PyObject *str,
9617 PyObject *substr,
9618 Py_ssize_t start,
9619 Py_ssize_t end)
9620 {
9621 Py_ssize_t result;
9622 int kind1, kind2;
9623 const void *buf1 = NULL, *buf2 = NULL;
9624 Py_ssize_t len1, len2;
9625
9626 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9627 return -1;
9628
9629 kind1 = PyUnicode_KIND(str);
9630 kind2 = PyUnicode_KIND(substr);
9631 if (kind1 < kind2)
9632 return 0;
9633
9634 len1 = PyUnicode_GET_LENGTH(str);
9635 len2 = PyUnicode_GET_LENGTH(substr);
9636 ADJUST_INDICES(start, end, len1);
9637 if (end - start < len2)
9638 return 0;
9639
9640 buf1 = PyUnicode_DATA(str);
9641 buf2 = PyUnicode_DATA(substr);
9642 if (kind2 != kind1) {
9643 buf2 = unicode_askind(kind2, buf2, len2, kind1);
9644 if (!buf2)
9645 goto onError;
9646 }
9647
9648 switch (kind1) {
9649 case PyUnicode_1BYTE_KIND:
9650 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9651 result = asciilib_count(
9652 ((const Py_UCS1*)buf1) + start, end - start,
9653 buf2, len2, PY_SSIZE_T_MAX
9654 );
9655 else
9656 result = ucs1lib_count(
9657 ((const Py_UCS1*)buf1) + start, end - start,
9658 buf2, len2, PY_SSIZE_T_MAX
9659 );
9660 break;
9661 case PyUnicode_2BYTE_KIND:
9662 result = ucs2lib_count(
9663 ((const Py_UCS2*)buf1) + start, end - start,
9664 buf2, len2, PY_SSIZE_T_MAX
9665 );
9666 break;
9667 case PyUnicode_4BYTE_KIND:
9668 result = ucs4lib_count(
9669 ((const Py_UCS4*)buf1) + start, end - start,
9670 buf2, len2, PY_SSIZE_T_MAX
9671 );
9672 break;
9673 default:
9674 Py_UNREACHABLE();
9675 }
9676
9677 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9678 if (kind2 != kind1)
9679 PyMem_Free((void *)buf2);
9680
9681 return result;
9682 onError:
9683 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9684 if (kind2 != kind1)
9685 PyMem_Free((void *)buf2);
9686 return -1;
9687 }
9688
9689 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9690 PyUnicode_Find(PyObject *str,
9691 PyObject *substr,
9692 Py_ssize_t start,
9693 Py_ssize_t end,
9694 int direction)
9695 {
9696 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9697 return -2;
9698
9699 return any_find_slice(str, substr, start, end, direction);
9700 }
9701
9702 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9703 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9704 Py_ssize_t start, Py_ssize_t end,
9705 int direction)
9706 {
9707 int kind;
9708 Py_ssize_t len, result;
9709 if (PyUnicode_READY(str) == -1)
9710 return -2;
9711 len = PyUnicode_GET_LENGTH(str);
9712 ADJUST_INDICES(start, end, len);
9713 if (end - start < 1)
9714 return -1;
9715 kind = PyUnicode_KIND(str);
9716 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9717 kind, end-start, ch, direction);
9718 if (result == -1)
9719 return -1;
9720 else
9721 return start + result;
9722 }
9723
9724 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9725 tailmatch(PyObject *self,
9726 PyObject *substring,
9727 Py_ssize_t start,
9728 Py_ssize_t end,
9729 int direction)
9730 {
9731 int kind_self;
9732 int kind_sub;
9733 const void *data_self;
9734 const void *data_sub;
9735 Py_ssize_t offset;
9736 Py_ssize_t i;
9737 Py_ssize_t end_sub;
9738
9739 if (PyUnicode_READY(self) == -1 ||
9740 PyUnicode_READY(substring) == -1)
9741 return -1;
9742
9743 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9744 end -= PyUnicode_GET_LENGTH(substring);
9745 if (end < start)
9746 return 0;
9747
9748 if (PyUnicode_GET_LENGTH(substring) == 0)
9749 return 1;
9750
9751 kind_self = PyUnicode_KIND(self);
9752 data_self = PyUnicode_DATA(self);
9753 kind_sub = PyUnicode_KIND(substring);
9754 data_sub = PyUnicode_DATA(substring);
9755 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9756
9757 if (direction > 0)
9758 offset = end;
9759 else
9760 offset = start;
9761
9762 if (PyUnicode_READ(kind_self, data_self, offset) ==
9763 PyUnicode_READ(kind_sub, data_sub, 0) &&
9764 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9765 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9766 /* If both are of the same kind, memcmp is sufficient */
9767 if (kind_self == kind_sub) {
9768 return ! memcmp((char *)data_self +
9769 (offset * PyUnicode_KIND(substring)),
9770 data_sub,
9771 PyUnicode_GET_LENGTH(substring) *
9772 PyUnicode_KIND(substring));
9773 }
9774 /* otherwise we have to compare each character by first accessing it */
9775 else {
9776 /* We do not need to compare 0 and len(substring)-1 because
9777 the if statement above ensured already that they are equal
9778 when we end up here. */
9779 for (i = 1; i < end_sub; ++i) {
9780 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9781 PyUnicode_READ(kind_sub, data_sub, i))
9782 return 0;
9783 }
9784 return 1;
9785 }
9786 }
9787
9788 return 0;
9789 }
9790
9791 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9792 PyUnicode_Tailmatch(PyObject *str,
9793 PyObject *substr,
9794 Py_ssize_t start,
9795 Py_ssize_t end,
9796 int direction)
9797 {
9798 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9799 return -1;
9800
9801 return tailmatch(str, substr, start, end, direction);
9802 }
9803
9804 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9805 ascii_upper_or_lower(PyObject *self, int lower)
9806 {
9807 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9808 const char *data = PyUnicode_DATA(self);
9809 char *resdata;
9810 PyObject *res;
9811
9812 res = PyUnicode_New(len, 127);
9813 if (res == NULL)
9814 return NULL;
9815 resdata = PyUnicode_DATA(res);
9816 if (lower)
9817 _Py_bytes_lower(resdata, data, len);
9818 else
9819 _Py_bytes_upper(resdata, data, len);
9820 return res;
9821 }
9822
9823 static Py_UCS4
handle_capital_sigma(int kind,const void * data,Py_ssize_t length,Py_ssize_t i)9824 handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9825 {
9826 Py_ssize_t j;
9827 int final_sigma;
9828 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9829 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9830
9831 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9832
9833 where ! is a negation and \p{xxx} is a character with property xxx.
9834 */
9835 for (j = i - 1; j >= 0; j--) {
9836 c = PyUnicode_READ(kind, data, j);
9837 if (!_PyUnicode_IsCaseIgnorable(c))
9838 break;
9839 }
9840 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9841 if (final_sigma) {
9842 for (j = i + 1; j < length; j++) {
9843 c = PyUnicode_READ(kind, data, j);
9844 if (!_PyUnicode_IsCaseIgnorable(c))
9845 break;
9846 }
9847 final_sigma = j == length || !_PyUnicode_IsCased(c);
9848 }
9849 return (final_sigma) ? 0x3C2 : 0x3C3;
9850 }
9851
9852 static int
lower_ucs4(int kind,const void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9853 lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9854 Py_UCS4 c, Py_UCS4 *mapped)
9855 {
9856 /* Obscure special case. */
9857 if (c == 0x3A3) {
9858 mapped[0] = handle_capital_sigma(kind, data, length, i);
9859 return 1;
9860 }
9861 return _PyUnicode_ToLowerFull(c, mapped);
9862 }
9863
9864 static Py_ssize_t
do_capitalize(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9865 do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9866 {
9867 Py_ssize_t i, k = 0;
9868 int n_res, j;
9869 Py_UCS4 c, mapped[3];
9870
9871 c = PyUnicode_READ(kind, data, 0);
9872 n_res = _PyUnicode_ToTitleFull(c, mapped);
9873 for (j = 0; j < n_res; j++) {
9874 *maxchar = Py_MAX(*maxchar, mapped[j]);
9875 res[k++] = mapped[j];
9876 }
9877 for (i = 1; i < length; i++) {
9878 c = PyUnicode_READ(kind, data, i);
9879 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9880 for (j = 0; j < n_res; j++) {
9881 *maxchar = Py_MAX(*maxchar, mapped[j]);
9882 res[k++] = mapped[j];
9883 }
9884 }
9885 return k;
9886 }
9887
9888 static Py_ssize_t
do_swapcase(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9889 do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9890 Py_ssize_t i, k = 0;
9891
9892 for (i = 0; i < length; i++) {
9893 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9894 int n_res, j;
9895 if (Py_UNICODE_ISUPPER(c)) {
9896 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9897 }
9898 else if (Py_UNICODE_ISLOWER(c)) {
9899 n_res = _PyUnicode_ToUpperFull(c, mapped);
9900 }
9901 else {
9902 n_res = 1;
9903 mapped[0] = c;
9904 }
9905 for (j = 0; j < n_res; j++) {
9906 *maxchar = Py_MAX(*maxchar, mapped[j]);
9907 res[k++] = mapped[j];
9908 }
9909 }
9910 return k;
9911 }
9912
9913 static Py_ssize_t
do_upper_or_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9914 do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9915 Py_UCS4 *maxchar, int lower)
9916 {
9917 Py_ssize_t i, k = 0;
9918
9919 for (i = 0; i < length; i++) {
9920 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9921 int n_res, j;
9922 if (lower)
9923 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9924 else
9925 n_res = _PyUnicode_ToUpperFull(c, mapped);
9926 for (j = 0; j < n_res; j++) {
9927 *maxchar = Py_MAX(*maxchar, mapped[j]);
9928 res[k++] = mapped[j];
9929 }
9930 }
9931 return k;
9932 }
9933
9934 static Py_ssize_t
do_upper(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9935 do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9936 {
9937 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9938 }
9939
9940 static Py_ssize_t
do_lower(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9941 do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9942 {
9943 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9944 }
9945
9946 static Py_ssize_t
do_casefold(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9947 do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9948 {
9949 Py_ssize_t i, k = 0;
9950
9951 for (i = 0; i < length; i++) {
9952 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9953 Py_UCS4 mapped[3];
9954 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9955 for (j = 0; j < n_res; j++) {
9956 *maxchar = Py_MAX(*maxchar, mapped[j]);
9957 res[k++] = mapped[j];
9958 }
9959 }
9960 return k;
9961 }
9962
9963 static Py_ssize_t
do_title(int kind,const void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9964 do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9965 {
9966 Py_ssize_t i, k = 0;
9967 int previous_is_cased;
9968
9969 previous_is_cased = 0;
9970 for (i = 0; i < length; i++) {
9971 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9972 Py_UCS4 mapped[3];
9973 int n_res, j;
9974
9975 if (previous_is_cased)
9976 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9977 else
9978 n_res = _PyUnicode_ToTitleFull(c, mapped);
9979
9980 for (j = 0; j < n_res; j++) {
9981 *maxchar = Py_MAX(*maxchar, mapped[j]);
9982 res[k++] = mapped[j];
9983 }
9984
9985 previous_is_cased = _PyUnicode_IsCased(c);
9986 }
9987 return k;
9988 }
9989
9990 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,const void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9991 case_operation(PyObject *self,
9992 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9993 {
9994 PyObject *res = NULL;
9995 Py_ssize_t length, newlength = 0;
9996 int kind, outkind;
9997 const void *data;
9998 void *outdata;
9999 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10000
10001 assert(PyUnicode_IS_READY(self));
10002
10003 kind = PyUnicode_KIND(self);
10004 data = PyUnicode_DATA(self);
10005 length = PyUnicode_GET_LENGTH(self);
10006 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10007 PyErr_SetString(PyExc_OverflowError, "string is too long");
10008 return NULL;
10009 }
10010 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10011 if (tmp == NULL)
10012 return PyErr_NoMemory();
10013 newlength = perform(kind, data, length, tmp, &maxchar);
10014 res = PyUnicode_New(newlength, maxchar);
10015 if (res == NULL)
10016 goto leave;
10017 tmpend = tmp + newlength;
10018 outdata = PyUnicode_DATA(res);
10019 outkind = PyUnicode_KIND(res);
10020 switch (outkind) {
10021 case PyUnicode_1BYTE_KIND:
10022 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10023 break;
10024 case PyUnicode_2BYTE_KIND:
10025 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10026 break;
10027 case PyUnicode_4BYTE_KIND:
10028 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10029 break;
10030 default:
10031 Py_UNREACHABLE();
10032 }
10033 leave:
10034 PyMem_Free(tmp);
10035 return res;
10036 }
10037
10038 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)10039 PyUnicode_Join(PyObject *separator, PyObject *seq)
10040 {
10041 PyObject *res;
10042 PyObject *fseq;
10043 Py_ssize_t seqlen;
10044 PyObject **items;
10045
10046 fseq = PySequence_Fast(seq, "can only join an iterable");
10047 if (fseq == NULL) {
10048 return NULL;
10049 }
10050
10051 /* NOTE: the following code can't call back into Python code,
10052 * so we are sure that fseq won't be mutated.
10053 */
10054
10055 items = PySequence_Fast_ITEMS(fseq);
10056 seqlen = PySequence_Fast_GET_SIZE(fseq);
10057 res = _PyUnicode_JoinArray(separator, items, seqlen);
10058 Py_DECREF(fseq);
10059 return res;
10060 }
10061
/* Join the `seqlen` objects in `items` with `separator` between them.

   separator: the str to insert between items, or NULL to use a single
              space.
   items:     array of objects; each must be a str (or subclass).  The
              array entries are only read here, never decref'ed.
   seqlen:    number of entries in `items`.

   Returns a new reference to the joined string, or NULL with an exception
   set (TypeError for a non-str separator or item, OverflowError if the
   total length does not fit in Py_ssize_t, or an error from readying /
   allocating). */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single str-subclass item: no separator is ever emitted, so it
           is not inspected at all (sep stays NULL). */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* The separator takes part in the "same kind everywhere?" check
           performed in the pre-pass below. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    /* Debug builds always take the _PyUnicode_FastCopyCharacters() path. */
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by one separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Checked before the addition so that sz itself never overflows. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* Raw memcpy is only usable while every object seen so far
           (separator included) has the same kind as its predecessor. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Fast path: byte-wise copies; `kind` is used as the number of
           bytes per character. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                       sep_data,
                       kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                       PyUnicode_DATA(item),
                       kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* General path: character-wise copies that cope with differing
           kinds between source and result. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    /* Both pointers may still be NULL here, hence the X variants. */
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10233
10234 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10235 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10236 Py_UCS4 fill_char)
10237 {
10238 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10239 void *data = PyUnicode_DATA(unicode);
10240 assert(PyUnicode_IS_READY(unicode));
10241 assert(unicode_modifiable(unicode));
10242 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10243 assert(start >= 0);
10244 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10245 unicode_fill(kind, data, fill_char, start, length);
10246 }
10247
10248 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10249 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10250 Py_UCS4 fill_char)
10251 {
10252 Py_ssize_t maxlen;
10253
10254 if (!PyUnicode_Check(unicode)) {
10255 PyErr_BadInternalCall();
10256 return -1;
10257 }
10258 if (PyUnicode_READY(unicode) == -1)
10259 return -1;
10260 if (unicode_check_modifiable(unicode))
10261 return -1;
10262
10263 if (start < 0) {
10264 PyErr_SetString(PyExc_IndexError, "string index out of range");
10265 return -1;
10266 }
10267 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10268 PyErr_SetString(PyExc_ValueError,
10269 "fill character is bigger than "
10270 "the string maximum character");
10271 return -1;
10272 }
10273
10274 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10275 length = Py_MIN(maxlen, length);
10276 if (length <= 0)
10277 return 0;
10278
10279 _PyUnicode_FastFill(unicode, start, length, fill_char);
10280 return length;
10281 }
10282
10283 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10284 pad(PyObject *self,
10285 Py_ssize_t left,
10286 Py_ssize_t right,
10287 Py_UCS4 fill)
10288 {
10289 PyObject *u;
10290 Py_UCS4 maxchar;
10291 int kind;
10292 void *data;
10293
10294 if (left < 0)
10295 left = 0;
10296 if (right < 0)
10297 right = 0;
10298
10299 if (left == 0 && right == 0)
10300 return unicode_result_unchanged(self);
10301
10302 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10303 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10304 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10305 return NULL;
10306 }
10307 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10308 maxchar = Py_MAX(maxchar, fill);
10309 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10310 if (!u)
10311 return NULL;
10312
10313 kind = PyUnicode_KIND(u);
10314 data = PyUnicode_DATA(u);
10315 if (left)
10316 unicode_fill(kind, data, fill, 0, left);
10317 if (right)
10318 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10319 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10320 assert(_PyUnicode_CheckConsistency(u, 1));
10321 return u;
10322 }
10323
10324 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10325 PyUnicode_Splitlines(PyObject *string, int keepends)
10326 {
10327 PyObject *list;
10328
10329 if (ensure_unicode(string) < 0)
10330 return NULL;
10331
10332 switch (PyUnicode_KIND(string)) {
10333 case PyUnicode_1BYTE_KIND:
10334 if (PyUnicode_IS_ASCII(string))
10335 list = asciilib_splitlines(
10336 string, PyUnicode_1BYTE_DATA(string),
10337 PyUnicode_GET_LENGTH(string), keepends);
10338 else
10339 list = ucs1lib_splitlines(
10340 string, PyUnicode_1BYTE_DATA(string),
10341 PyUnicode_GET_LENGTH(string), keepends);
10342 break;
10343 case PyUnicode_2BYTE_KIND:
10344 list = ucs2lib_splitlines(
10345 string, PyUnicode_2BYTE_DATA(string),
10346 PyUnicode_GET_LENGTH(string), keepends);
10347 break;
10348 case PyUnicode_4BYTE_KIND:
10349 list = ucs4lib_splitlines(
10350 string, PyUnicode_4BYTE_DATA(string),
10351 PyUnicode_GET_LENGTH(string), keepends);
10352 break;
10353 default:
10354 Py_UNREACHABLE();
10355 }
10356 return list;
10357 }
10358
10359 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10360 split(PyObject *self,
10361 PyObject *substring,
10362 Py_ssize_t maxcount)
10363 {
10364 int kind1, kind2;
10365 const void *buf1, *buf2;
10366 Py_ssize_t len1, len2;
10367 PyObject* out;
10368
10369 if (maxcount < 0)
10370 maxcount = PY_SSIZE_T_MAX;
10371
10372 if (PyUnicode_READY(self) == -1)
10373 return NULL;
10374
10375 if (substring == NULL)
10376 switch (PyUnicode_KIND(self)) {
10377 case PyUnicode_1BYTE_KIND:
10378 if (PyUnicode_IS_ASCII(self))
10379 return asciilib_split_whitespace(
10380 self, PyUnicode_1BYTE_DATA(self),
10381 PyUnicode_GET_LENGTH(self), maxcount
10382 );
10383 else
10384 return ucs1lib_split_whitespace(
10385 self, PyUnicode_1BYTE_DATA(self),
10386 PyUnicode_GET_LENGTH(self), maxcount
10387 );
10388 case PyUnicode_2BYTE_KIND:
10389 return ucs2lib_split_whitespace(
10390 self, PyUnicode_2BYTE_DATA(self),
10391 PyUnicode_GET_LENGTH(self), maxcount
10392 );
10393 case PyUnicode_4BYTE_KIND:
10394 return ucs4lib_split_whitespace(
10395 self, PyUnicode_4BYTE_DATA(self),
10396 PyUnicode_GET_LENGTH(self), maxcount
10397 );
10398 default:
10399 Py_UNREACHABLE();
10400 }
10401
10402 if (PyUnicode_READY(substring) == -1)
10403 return NULL;
10404
10405 kind1 = PyUnicode_KIND(self);
10406 kind2 = PyUnicode_KIND(substring);
10407 len1 = PyUnicode_GET_LENGTH(self);
10408 len2 = PyUnicode_GET_LENGTH(substring);
10409 if (kind1 < kind2 || len1 < len2) {
10410 out = PyList_New(1);
10411 if (out == NULL)
10412 return NULL;
10413 Py_INCREF(self);
10414 PyList_SET_ITEM(out, 0, self);
10415 return out;
10416 }
10417 buf1 = PyUnicode_DATA(self);
10418 buf2 = PyUnicode_DATA(substring);
10419 if (kind2 != kind1) {
10420 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10421 if (!buf2)
10422 return NULL;
10423 }
10424
10425 switch (kind1) {
10426 case PyUnicode_1BYTE_KIND:
10427 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10428 out = asciilib_split(
10429 self, buf1, len1, buf2, len2, maxcount);
10430 else
10431 out = ucs1lib_split(
10432 self, buf1, len1, buf2, len2, maxcount);
10433 break;
10434 case PyUnicode_2BYTE_KIND:
10435 out = ucs2lib_split(
10436 self, buf1, len1, buf2, len2, maxcount);
10437 break;
10438 case PyUnicode_4BYTE_KIND:
10439 out = ucs4lib_split(
10440 self, buf1, len1, buf2, len2, maxcount);
10441 break;
10442 default:
10443 out = NULL;
10444 }
10445 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10446 if (kind2 != kind1)
10447 PyMem_Free((void *)buf2);
10448 return out;
10449 }
10450
10451 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10452 rsplit(PyObject *self,
10453 PyObject *substring,
10454 Py_ssize_t maxcount)
10455 {
10456 int kind1, kind2;
10457 const void *buf1, *buf2;
10458 Py_ssize_t len1, len2;
10459 PyObject* out;
10460
10461 if (maxcount < 0)
10462 maxcount = PY_SSIZE_T_MAX;
10463
10464 if (PyUnicode_READY(self) == -1)
10465 return NULL;
10466
10467 if (substring == NULL)
10468 switch (PyUnicode_KIND(self)) {
10469 case PyUnicode_1BYTE_KIND:
10470 if (PyUnicode_IS_ASCII(self))
10471 return asciilib_rsplit_whitespace(
10472 self, PyUnicode_1BYTE_DATA(self),
10473 PyUnicode_GET_LENGTH(self), maxcount
10474 );
10475 else
10476 return ucs1lib_rsplit_whitespace(
10477 self, PyUnicode_1BYTE_DATA(self),
10478 PyUnicode_GET_LENGTH(self), maxcount
10479 );
10480 case PyUnicode_2BYTE_KIND:
10481 return ucs2lib_rsplit_whitespace(
10482 self, PyUnicode_2BYTE_DATA(self),
10483 PyUnicode_GET_LENGTH(self), maxcount
10484 );
10485 case PyUnicode_4BYTE_KIND:
10486 return ucs4lib_rsplit_whitespace(
10487 self, PyUnicode_4BYTE_DATA(self),
10488 PyUnicode_GET_LENGTH(self), maxcount
10489 );
10490 default:
10491 Py_UNREACHABLE();
10492 }
10493
10494 if (PyUnicode_READY(substring) == -1)
10495 return NULL;
10496
10497 kind1 = PyUnicode_KIND(self);
10498 kind2 = PyUnicode_KIND(substring);
10499 len1 = PyUnicode_GET_LENGTH(self);
10500 len2 = PyUnicode_GET_LENGTH(substring);
10501 if (kind1 < kind2 || len1 < len2) {
10502 out = PyList_New(1);
10503 if (out == NULL)
10504 return NULL;
10505 Py_INCREF(self);
10506 PyList_SET_ITEM(out, 0, self);
10507 return out;
10508 }
10509 buf1 = PyUnicode_DATA(self);
10510 buf2 = PyUnicode_DATA(substring);
10511 if (kind2 != kind1) {
10512 buf2 = unicode_askind(kind2, buf2, len2, kind1);
10513 if (!buf2)
10514 return NULL;
10515 }
10516
10517 switch (kind1) {
10518 case PyUnicode_1BYTE_KIND:
10519 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10520 out = asciilib_rsplit(
10521 self, buf1, len1, buf2, len2, maxcount);
10522 else
10523 out = ucs1lib_rsplit(
10524 self, buf1, len1, buf2, len2, maxcount);
10525 break;
10526 case PyUnicode_2BYTE_KIND:
10527 out = ucs2lib_rsplit(
10528 self, buf1, len1, buf2, len2, maxcount);
10529 break;
10530 case PyUnicode_4BYTE_KIND:
10531 out = ucs4lib_rsplit(
10532 self, buf1, len1, buf2, len2, maxcount);
10533 break;
10534 default:
10535 out = NULL;
10536 }
10537 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10538 if (kind2 != kind1)
10539 PyMem_Free((void *)buf2);
10540 return out;
10541 }
10542
10543 static Py_ssize_t
anylib_find(int kind,PyObject * str1,const void * buf1,Py_ssize_t len1,PyObject * str2,const void * buf2,Py_ssize_t len2,Py_ssize_t offset)10544 anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10545 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10546 {
10547 switch (kind) {
10548 case PyUnicode_1BYTE_KIND:
10549 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10550 return asciilib_find(buf1, len1, buf2, len2, offset);
10551 else
10552 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10553 case PyUnicode_2BYTE_KIND:
10554 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10555 case PyUnicode_4BYTE_KIND:
10556 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10557 }
10558 Py_UNREACHABLE();
10559 }
10560
10561 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,const void * sbuf,Py_ssize_t slen,PyObject * str1,const void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10562 anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10563 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10564 {
10565 switch (kind) {
10566 case PyUnicode_1BYTE_KIND:
10567 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10568 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10569 else
10570 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10571 case PyUnicode_2BYTE_KIND:
10572 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10573 case PyUnicode_4BYTE_KIND:
10574 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10575 }
10576 Py_UNREACHABLE();
10577 }
10578
10579 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10580 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10581 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10582 {
10583 int kind = PyUnicode_KIND(u);
10584 void *data = PyUnicode_DATA(u);
10585 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10586 if (kind == PyUnicode_1BYTE_KIND) {
10587 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10588 (Py_UCS1 *)data + len,
10589 u1, u2, maxcount);
10590 }
10591 else if (kind == PyUnicode_2BYTE_KIND) {
10592 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10593 (Py_UCS2 *)data + len,
10594 u1, u2, maxcount);
10595 }
10596 else {
10597 assert(kind == PyUnicode_4BYTE_KIND);
10598 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10599 (Py_UCS4 *)data + len,
10600 u1, u2, maxcount);
10601 }
10602 }
10603
/* Return a string in which occurrences of `str1` inside `self` have been
   replaced by `str2`.

   maxcount: upper bound on the number of replacements; a negative value
             means no limit, zero means "replace nothing".

   All three objects are accessed with PyUnicode_DATA/KIND/GET_LENGTH
   right away, so callers are expected to pass ready str objects.

   Three strategies are used:
     - same length, single character: copy `self` and patch characters in
       place;
     - same length, longer: copy `self` and overwrite each match;
     - different lengths: count the matches, allocate the exact result
       size and assemble it piecewise.

   Whenever buffers of different kinds have to be combined, the narrower
   ones are widened into temporary copies; srelease/release1/release2
   record which of sbuf/buf1/buf2 currently point at such a copy and must
   be freed on exit.

   Returns a new reference (unicode_result_unchanged(self) when nothing is
   replaced), or NULL with an exception set. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    const char *sbuf = PyUnicode_DATA(self);
    const void *buf1 = PyUnicode_DATA(str1);
    const void *buf2 = PyUnicode_DATA(str2);
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* A pattern longer than the string cannot match. */
    if (slen < len1)
        goto nothing;

    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0)
        goto nothing;

    /* Replacing something by the very same object changes nothing. */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* Copy everything, then patch from the first match onwards. */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind is the kind all working buffers are brought to. */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            /* Look for the first match before allocating anything else. */
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* Drop the copy made for the old rkind, if any; it is
                   re-created for the new rkind below. */
                if (release1) {
                    assert(buf1 != PyUnicode_DATA(str1));
                    PyMem_Free((void *)buf1);
                    buf1 = PyUnicode_DATA(str1);
                    release1 = 0;
                }
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* rkind doubles as the number of bytes per character. */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* One replacement has been made already, hence the
               pre-decrement. */
            while ( --maxcount > 0) {
                /* The search starts at position i; the result is used as
                   an index into the whole string, so the offset argument
                   presumably gets added to the match position. */
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* different lengths: the result size depends on the match count */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* n: number of replacements that will be made. */
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* Drop the copy made for the old rkind, if any; it is
               re-created for the new rkind below. */
            if (release1) {
                assert(buf1 != PyUnicode_DATA(str1));
                PyMem_Free((void *)buf1);
                buf1 = PyUnicode_DATA(str1);
                release1 = 0;
            }
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1)); */
        /* Only growth can overflow; test it before multiplying. */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            u = unicode_new_empty();
            goto done;
        }
        /* The byte size (characters * rkind) must fit as well. */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        /* i: read position in the source, ires: write position in res. */
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* Empty pattern: emit str2, then one source character, and
               repeat until n copies of str2 have been written. */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            /* Whatever is left of the source follows the last copy. */
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* unicode_adjust_maxchar() may replace u; it is NULL on failure. */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* Each release flag must agree with whether its buffer was replaced
       by a temporary copy. */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_Free((void *)sbuf);
    if (release1)
        PyMem_Free((void *)buf1);
    if (release2)
        PyMem_Free((void *)buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_Free((void *)sbuf);
    if (release1)
        PyMem_Free((void *)buf1);
    if (release2)
        PyMem_Free((void *)buf2);
    return unicode_result_unchanged(self);

  error:
    /* NOTE(review): when a unicode_askind() call above fails, the buffer
       pointer is already NULL while its release flag is still 0, so the
       matching assertion below would fire in assertion-enabled builds --
       confirm whether that is intended. */
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
    if (srelease)
        PyMem_Free((void *)sbuf);
    if (release1)
        PyMem_Free((void *)buf1);
    if (release2)
        PyMem_Free((void *)buf2);
    return NULL;
}
10882
10883 /* --- Unicode Object Methods --------------------------------------------- */
10884
10885 /*[clinic input]
10886 str.title as unicode_title
10887
10888 Return a version of the string where each word is titlecased.
10889
10890 More specifically, words start with uppercased characters and all remaining
10891 cased characters have lower case.
10892 [clinic start generated code]*/
10893
10894 static PyObject *
unicode_title_impl(PyObject * self)10895 unicode_title_impl(PyObject *self)
10896 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10897 {
10898 if (PyUnicode_READY(self) == -1)
10899 return NULL;
10900 return case_operation(self, do_title);
10901 }
10902
10903 /*[clinic input]
10904 str.capitalize as unicode_capitalize
10905
10906 Return a capitalized version of the string.
10907
10908 More specifically, make the first character have upper case and the rest lower
10909 case.
10910 [clinic start generated code]*/
10911
10912 static PyObject *
unicode_capitalize_impl(PyObject * self)10913 unicode_capitalize_impl(PyObject *self)
10914 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10915 {
10916 if (PyUnicode_READY(self) == -1)
10917 return NULL;
10918 if (PyUnicode_GET_LENGTH(self) == 0)
10919 return unicode_result_unchanged(self);
10920 return case_operation(self, do_capitalize);
10921 }
10922
10923 /*[clinic input]
10924 str.casefold as unicode_casefold
10925
10926 Return a version of the string suitable for caseless comparisons.
10927 [clinic start generated code]*/
10928
10929 static PyObject *
unicode_casefold_impl(PyObject * self)10930 unicode_casefold_impl(PyObject *self)
10931 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10932 {
10933 if (PyUnicode_READY(self) == -1)
10934 return NULL;
10935 if (PyUnicode_IS_ASCII(self))
10936 return ascii_upper_or_lower(self, 1);
10937 return case_operation(self, do_casefold);
10938 }
10939
10940
10941 /* Argument converter. Accepts a single Unicode character. */
10942
10943 static int
convert_uc(PyObject * obj,void * addr)10944 convert_uc(PyObject *obj, void *addr)
10945 {
10946 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10947
10948 if (!PyUnicode_Check(obj)) {
10949 PyErr_Format(PyExc_TypeError,
10950 "The fill character must be a unicode character, "
10951 "not %.100s", Py_TYPE(obj)->tp_name);
10952 return 0;
10953 }
10954 if (PyUnicode_READY(obj) < 0)
10955 return 0;
10956 if (PyUnicode_GET_LENGTH(obj) != 1) {
10957 PyErr_SetString(PyExc_TypeError,
10958 "The fill character must be exactly one character long");
10959 return 0;
10960 }
10961 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10962 return 1;
10963 }
10964
10965 /*[clinic input]
10966 str.center as unicode_center
10967
10968 width: Py_ssize_t
10969 fillchar: Py_UCS4 = ' '
10970 /
10971
10972 Return a centered string of length width.
10973
10974 Padding is done using the specified fill character (default is a space).
10975 [clinic start generated code]*/
10976
10977 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10978 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10979 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10980 {
10981 Py_ssize_t marg, left;
10982
10983 if (PyUnicode_READY(self) == -1)
10984 return NULL;
10985
10986 if (PyUnicode_GET_LENGTH(self) >= width)
10987 return unicode_result_unchanged(self);
10988
10989 marg = width - PyUnicode_GET_LENGTH(self);
10990 left = marg / 2 + (marg & width & 1);
10991
10992 return pad(self, left, marg - left, fillchar);
10993 }
10994
10995 /* This function assumes that str1 and str2 are readied by the caller. */
10996
10997 static int
unicode_compare(PyObject * str1,PyObject * str2)10998 unicode_compare(PyObject *str1, PyObject *str2)
10999 {
11000 #define COMPARE(TYPE1, TYPE2) \
11001 do { \
11002 TYPE1* p1 = (TYPE1 *)data1; \
11003 TYPE2* p2 = (TYPE2 *)data2; \
11004 TYPE1* end = p1 + len; \
11005 Py_UCS4 c1, c2; \
11006 for (; p1 != end; p1++, p2++) { \
11007 c1 = *p1; \
11008 c2 = *p2; \
11009 if (c1 != c2) \
11010 return (c1 < c2) ? -1 : 1; \
11011 } \
11012 } \
11013 while (0)
11014
11015 int kind1, kind2;
11016 const void *data1, *data2;
11017 Py_ssize_t len1, len2, len;
11018
11019 kind1 = PyUnicode_KIND(str1);
11020 kind2 = PyUnicode_KIND(str2);
11021 data1 = PyUnicode_DATA(str1);
11022 data2 = PyUnicode_DATA(str2);
11023 len1 = PyUnicode_GET_LENGTH(str1);
11024 len2 = PyUnicode_GET_LENGTH(str2);
11025 len = Py_MIN(len1, len2);
11026
11027 switch(kind1) {
11028 case PyUnicode_1BYTE_KIND:
11029 {
11030 switch(kind2) {
11031 case PyUnicode_1BYTE_KIND:
11032 {
11033 int cmp = memcmp(data1, data2, len);
11034 /* normalize result of memcmp() into the range [-1; 1] */
11035 if (cmp < 0)
11036 return -1;
11037 if (cmp > 0)
11038 return 1;
11039 break;
11040 }
11041 case PyUnicode_2BYTE_KIND:
11042 COMPARE(Py_UCS1, Py_UCS2);
11043 break;
11044 case PyUnicode_4BYTE_KIND:
11045 COMPARE(Py_UCS1, Py_UCS4);
11046 break;
11047 default:
11048 Py_UNREACHABLE();
11049 }
11050 break;
11051 }
11052 case PyUnicode_2BYTE_KIND:
11053 {
11054 switch(kind2) {
11055 case PyUnicode_1BYTE_KIND:
11056 COMPARE(Py_UCS2, Py_UCS1);
11057 break;
11058 case PyUnicode_2BYTE_KIND:
11059 {
11060 COMPARE(Py_UCS2, Py_UCS2);
11061 break;
11062 }
11063 case PyUnicode_4BYTE_KIND:
11064 COMPARE(Py_UCS2, Py_UCS4);
11065 break;
11066 default:
11067 Py_UNREACHABLE();
11068 }
11069 break;
11070 }
11071 case PyUnicode_4BYTE_KIND:
11072 {
11073 switch(kind2) {
11074 case PyUnicode_1BYTE_KIND:
11075 COMPARE(Py_UCS4, Py_UCS1);
11076 break;
11077 case PyUnicode_2BYTE_KIND:
11078 COMPARE(Py_UCS4, Py_UCS2);
11079 break;
11080 case PyUnicode_4BYTE_KIND:
11081 {
11082 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11083 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11084 /* normalize result of wmemcmp() into the range [-1; 1] */
11085 if (cmp < 0)
11086 return -1;
11087 if (cmp > 0)
11088 return 1;
11089 #else
11090 COMPARE(Py_UCS4, Py_UCS4);
11091 #endif
11092 break;
11093 }
11094 default:
11095 Py_UNREACHABLE();
11096 }
11097 break;
11098 }
11099 default:
11100 Py_UNREACHABLE();
11101 }
11102
11103 if (len1 == len2)
11104 return 0;
11105 if (len1 < len2)
11106 return -1;
11107 else
11108 return 1;
11109
11110 #undef COMPARE
11111 }
11112
11113 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)11114 unicode_compare_eq(PyObject *str1, PyObject *str2)
11115 {
11116 int kind;
11117 const void *data1, *data2;
11118 Py_ssize_t len;
11119 int cmp;
11120
11121 len = PyUnicode_GET_LENGTH(str1);
11122 if (PyUnicode_GET_LENGTH(str2) != len)
11123 return 0;
11124 kind = PyUnicode_KIND(str1);
11125 if (PyUnicode_KIND(str2) != kind)
11126 return 0;
11127 data1 = PyUnicode_DATA(str1);
11128 data2 = PyUnicode_DATA(str2);
11129
11130 cmp = memcmp(data1, data2, len * kind);
11131 return (cmp == 0);
11132 }
11133
11134 int
_PyUnicode_Equal(PyObject * str1,PyObject * str2)11135 _PyUnicode_Equal(PyObject *str1, PyObject *str2)
11136 {
11137 assert(PyUnicode_Check(str1));
11138 assert(PyUnicode_Check(str2));
11139 if (str1 == str2) {
11140 return 1;
11141 }
11142 if (PyUnicode_READY(str1) || PyUnicode_READY(str2)) {
11143 return -1;
11144 }
11145 return unicode_compare_eq(str1, str2);
11146 }
11147
11148
11149 int
PyUnicode_Compare(PyObject * left,PyObject * right)11150 PyUnicode_Compare(PyObject *left, PyObject *right)
11151 {
11152 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11153 if (PyUnicode_READY(left) == -1 ||
11154 PyUnicode_READY(right) == -1)
11155 return -1;
11156
11157 /* a string is equal to itself */
11158 if (left == right)
11159 return 0;
11160
11161 return unicode_compare(left, right);
11162 }
11163 PyErr_Format(PyExc_TypeError,
11164 "Can't compare %.100s and %.100s",
11165 Py_TYPE(left)->tp_name,
11166 Py_TYPE(right)->tp_name);
11167 return -1;
11168 }
11169
11170 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11171 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11172 {
11173 Py_ssize_t i;
11174 int kind;
11175 Py_UCS4 chr;
11176 const unsigned char *ustr = (const unsigned char *)str;
11177
11178 assert(_PyUnicode_CHECK(uni));
11179 if (!PyUnicode_IS_READY(uni)) {
11180 const wchar_t *ws = _PyUnicode_WSTR(uni);
11181 /* Compare Unicode string and source character set string */
11182 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11183 if (chr != ustr[i])
11184 return (chr < ustr[i]) ? -1 : 1;
11185 }
11186 /* This check keeps Python strings that end in '\0' from comparing equal
11187 to C strings identical up to that point. */
11188 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11189 return 1; /* uni is longer */
11190 if (ustr[i])
11191 return -1; /* str is longer */
11192 return 0;
11193 }
11194 kind = PyUnicode_KIND(uni);
11195 if (kind == PyUnicode_1BYTE_KIND) {
11196 const void *data = PyUnicode_1BYTE_DATA(uni);
11197 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11198 size_t len, len2 = strlen(str);
11199 int cmp;
11200
11201 len = Py_MIN(len1, len2);
11202 cmp = memcmp(data, str, len);
11203 if (cmp != 0) {
11204 if (cmp < 0)
11205 return -1;
11206 else
11207 return 1;
11208 }
11209 if (len1 > len2)
11210 return 1; /* uni is longer */
11211 if (len1 < len2)
11212 return -1; /* str is longer */
11213 return 0;
11214 }
11215 else {
11216 const void *data = PyUnicode_DATA(uni);
11217 /* Compare Unicode string and source character set string */
11218 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11219 if (chr != (unsigned char)str[i])
11220 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11221 /* This check keeps Python strings that end in '\0' from comparing equal
11222 to C strings identical up to that point. */
11223 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11224 return 1; /* uni is longer */
11225 if (str[i])
11226 return -1; /* str is longer */
11227 return 0;
11228 }
11229 }
11230
11231 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11232 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11233 {
11234 size_t i, len;
11235 const wchar_t *p;
11236 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11237 if (strlen(str) != len)
11238 return 0;
11239 p = _PyUnicode_WSTR(unicode);
11240 assert(p);
11241 for (i = 0; i < len; i++) {
11242 unsigned char c = (unsigned char)str[i];
11243 if (c >= 128 || p[i] != (wchar_t)c)
11244 return 0;
11245 }
11246 return 1;
11247 }
11248
11249 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11250 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11251 {
11252 size_t len;
11253 assert(_PyUnicode_CHECK(unicode));
11254 assert(str);
11255 #ifndef NDEBUG
11256 for (const char *p = str; *p; p++) {
11257 assert((unsigned char)*p < 128);
11258 }
11259 #endif
11260 if (PyUnicode_READY(unicode) == -1) {
11261 /* Memory error or bad data */
11262 PyErr_Clear();
11263 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11264 }
11265 if (!PyUnicode_IS_ASCII(unicode))
11266 return 0;
11267 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11268 return strlen(str) == len &&
11269 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11270 }
11271
11272 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11273 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11274 {
11275 PyObject *right_uni;
11276
11277 assert(_PyUnicode_CHECK(left));
11278 assert(right->string);
11279 #ifndef NDEBUG
11280 for (const char *p = right->string; *p; p++) {
11281 assert((unsigned char)*p < 128);
11282 }
11283 #endif
11284
11285 if (PyUnicode_READY(left) == -1) {
11286 /* memory error or bad data */
11287 PyErr_Clear();
11288 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11289 }
11290
11291 if (!PyUnicode_IS_ASCII(left))
11292 return 0;
11293
11294 right_uni = _PyUnicode_FromId(right); /* borrowed */
11295 if (right_uni == NULL) {
11296 /* memory error or bad data */
11297 PyErr_Clear();
11298 return _PyUnicode_EqualToASCIIString(left, right->string);
11299 }
11300
11301 if (left == right_uni)
11302 return 1;
11303
11304 if (PyUnicode_CHECK_INTERNED(left))
11305 return 0;
11306
11307 assert(_PyUnicode_HASH(right_uni) != -1);
11308 Py_hash_t hash = _PyUnicode_HASH(left);
11309 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
11310 return 0;
11311 }
11312
11313 return unicode_compare_eq(left, right_uni);
11314 }
11315
11316 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11317 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11318 {
11319 int result;
11320
11321 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11322 Py_RETURN_NOTIMPLEMENTED;
11323
11324 if (PyUnicode_READY(left) == -1 ||
11325 PyUnicode_READY(right) == -1)
11326 return NULL;
11327
11328 if (left == right) {
11329 switch (op) {
11330 case Py_EQ:
11331 case Py_LE:
11332 case Py_GE:
11333 /* a string is equal to itself */
11334 Py_RETURN_TRUE;
11335 case Py_NE:
11336 case Py_LT:
11337 case Py_GT:
11338 Py_RETURN_FALSE;
11339 default:
11340 PyErr_BadArgument();
11341 return NULL;
11342 }
11343 }
11344 else if (op == Py_EQ || op == Py_NE) {
11345 result = unicode_compare_eq(left, right);
11346 result ^= (op == Py_NE);
11347 return PyBool_FromLong(result);
11348 }
11349 else {
11350 result = unicode_compare(left, right);
11351 Py_RETURN_RICHCOMPARE(result, 0, op);
11352 }
11353 }
11354
11355 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11356 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11357 {
11358 return unicode_eq(aa, bb);
11359 }
11360
11361 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11362 PyUnicode_Contains(PyObject *str, PyObject *substr)
11363 {
11364 int kind1, kind2;
11365 const void *buf1, *buf2;
11366 Py_ssize_t len1, len2;
11367 int result;
11368
11369 if (!PyUnicode_Check(substr)) {
11370 PyErr_Format(PyExc_TypeError,
11371 "'in <string>' requires string as left operand, not %.100s",
11372 Py_TYPE(substr)->tp_name);
11373 return -1;
11374 }
11375 if (PyUnicode_READY(substr) == -1)
11376 return -1;
11377 if (ensure_unicode(str) < 0)
11378 return -1;
11379
11380 kind1 = PyUnicode_KIND(str);
11381 kind2 = PyUnicode_KIND(substr);
11382 if (kind1 < kind2)
11383 return 0;
11384 len1 = PyUnicode_GET_LENGTH(str);
11385 len2 = PyUnicode_GET_LENGTH(substr);
11386 if (len1 < len2)
11387 return 0;
11388 buf1 = PyUnicode_DATA(str);
11389 buf2 = PyUnicode_DATA(substr);
11390 if (len2 == 1) {
11391 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11392 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11393 return result;
11394 }
11395 if (kind2 != kind1) {
11396 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11397 if (!buf2)
11398 return -1;
11399 }
11400
11401 switch (kind1) {
11402 case PyUnicode_1BYTE_KIND:
11403 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11404 break;
11405 case PyUnicode_2BYTE_KIND:
11406 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11407 break;
11408 case PyUnicode_4BYTE_KIND:
11409 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11410 break;
11411 default:
11412 Py_UNREACHABLE();
11413 }
11414
11415 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11416 if (kind2 != kind1)
11417 PyMem_Free((void *)buf2);
11418
11419 return result;
11420 }
11421
11422 /* Concat to string or Unicode object giving a new Unicode object. */
11423
11424 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11425 PyUnicode_Concat(PyObject *left, PyObject *right)
11426 {
11427 PyObject *result;
11428 Py_UCS4 maxchar, maxchar2;
11429 Py_ssize_t left_len, right_len, new_len;
11430
11431 if (ensure_unicode(left) < 0)
11432 return NULL;
11433
11434 if (!PyUnicode_Check(right)) {
11435 PyErr_Format(PyExc_TypeError,
11436 "can only concatenate str (not \"%.200s\") to str",
11437 Py_TYPE(right)->tp_name);
11438 return NULL;
11439 }
11440 if (PyUnicode_READY(right) < 0)
11441 return NULL;
11442
11443 /* Shortcuts */
11444 PyObject *empty = unicode_get_empty(); // Borrowed reference
11445 if (left == empty) {
11446 return PyUnicode_FromObject(right);
11447 }
11448 if (right == empty) {
11449 return PyUnicode_FromObject(left);
11450 }
11451
11452 left_len = PyUnicode_GET_LENGTH(left);
11453 right_len = PyUnicode_GET_LENGTH(right);
11454 if (left_len > PY_SSIZE_T_MAX - right_len) {
11455 PyErr_SetString(PyExc_OverflowError,
11456 "strings are too large to concat");
11457 return NULL;
11458 }
11459 new_len = left_len + right_len;
11460
11461 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11462 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11463 maxchar = Py_MAX(maxchar, maxchar2);
11464
11465 /* Concat the two Unicode strings */
11466 result = PyUnicode_New(new_len, maxchar);
11467 if (result == NULL)
11468 return NULL;
11469 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11470 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11471 assert(_PyUnicode_CheckConsistency(result, 1));
11472 return result;
11473 }
11474
11475 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11476 PyUnicode_Append(PyObject **p_left, PyObject *right)
11477 {
11478 PyObject *left, *res;
11479 Py_UCS4 maxchar, maxchar2;
11480 Py_ssize_t left_len, right_len, new_len;
11481
11482 if (p_left == NULL) {
11483 if (!PyErr_Occurred())
11484 PyErr_BadInternalCall();
11485 return;
11486 }
11487 left = *p_left;
11488 if (right == NULL || left == NULL
11489 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11490 if (!PyErr_Occurred())
11491 PyErr_BadInternalCall();
11492 goto error;
11493 }
11494
11495 if (PyUnicode_READY(left) == -1)
11496 goto error;
11497 if (PyUnicode_READY(right) == -1)
11498 goto error;
11499
11500 /* Shortcuts */
11501 PyObject *empty = unicode_get_empty(); // Borrowed reference
11502 if (left == empty) {
11503 Py_DECREF(left);
11504 Py_INCREF(right);
11505 *p_left = right;
11506 return;
11507 }
11508 if (right == empty) {
11509 return;
11510 }
11511
11512 left_len = PyUnicode_GET_LENGTH(left);
11513 right_len = PyUnicode_GET_LENGTH(right);
11514 if (left_len > PY_SSIZE_T_MAX - right_len) {
11515 PyErr_SetString(PyExc_OverflowError,
11516 "strings are too large to concat");
11517 goto error;
11518 }
11519 new_len = left_len + right_len;
11520
11521 if (unicode_modifiable(left)
11522 && PyUnicode_CheckExact(right)
11523 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11524 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11525 to change the structure size, but characters are stored just after
11526 the structure, and so it requires to move all characters which is
11527 not so different than duplicating the string. */
11528 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11529 {
11530 /* append inplace */
11531 if (unicode_resize(p_left, new_len) != 0)
11532 goto error;
11533
11534 /* copy 'right' into the newly allocated area of 'left' */
11535 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11536 }
11537 else {
11538 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11539 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11540 maxchar = Py_MAX(maxchar, maxchar2);
11541
11542 /* Concat the two Unicode strings */
11543 res = PyUnicode_New(new_len, maxchar);
11544 if (res == NULL)
11545 goto error;
11546 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11547 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11548 Py_DECREF(left);
11549 *p_left = res;
11550 }
11551 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11552 return;
11553
11554 error:
11555 Py_CLEAR(*p_left);
11556 }
11557
11558 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11559 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11560 {
11561 PyUnicode_Append(pleft, right);
11562 Py_XDECREF(right);
11563 }
11564
11565 /*
11566 Wraps stringlib_parse_args_finds() and additionally ensures that the
11567 first argument is a unicode object.
11568 */
11569
11570 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11571 parse_args_finds_unicode(const char * function_name, PyObject *args,
11572 PyObject **substring,
11573 Py_ssize_t *start, Py_ssize_t *end)
11574 {
11575 if(stringlib_parse_args_finds(function_name, args, substring,
11576 start, end)) {
11577 if (ensure_unicode(*substring) < 0)
11578 return 0;
11579 return 1;
11580 }
11581 return 0;
11582 }
11583
11584 PyDoc_STRVAR(count__doc__,
11585 "S.count(sub[, start[, end]]) -> int\n\
11586 \n\
11587 Return the number of non-overlapping occurrences of substring sub in\n\
11588 string S[start:end]. Optional arguments start and end are\n\
11589 interpreted as in slice notation.");
11590
11591 static PyObject *
unicode_count(PyObject * self,PyObject * args)11592 unicode_count(PyObject *self, PyObject *args)
11593 {
11594 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11595 Py_ssize_t start = 0;
11596 Py_ssize_t end = PY_SSIZE_T_MAX;
11597 PyObject *result;
11598 int kind1, kind2;
11599 const void *buf1, *buf2;
11600 Py_ssize_t len1, len2, iresult;
11601
11602 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11603 return NULL;
11604
11605 kind1 = PyUnicode_KIND(self);
11606 kind2 = PyUnicode_KIND(substring);
11607 if (kind1 < kind2)
11608 return PyLong_FromLong(0);
11609
11610 len1 = PyUnicode_GET_LENGTH(self);
11611 len2 = PyUnicode_GET_LENGTH(substring);
11612 ADJUST_INDICES(start, end, len1);
11613 if (end - start < len2)
11614 return PyLong_FromLong(0);
11615
11616 buf1 = PyUnicode_DATA(self);
11617 buf2 = PyUnicode_DATA(substring);
11618 if (kind2 != kind1) {
11619 buf2 = unicode_askind(kind2, buf2, len2, kind1);
11620 if (!buf2)
11621 return NULL;
11622 }
11623 switch (kind1) {
11624 case PyUnicode_1BYTE_KIND:
11625 iresult = ucs1lib_count(
11626 ((const Py_UCS1*)buf1) + start, end - start,
11627 buf2, len2, PY_SSIZE_T_MAX
11628 );
11629 break;
11630 case PyUnicode_2BYTE_KIND:
11631 iresult = ucs2lib_count(
11632 ((const Py_UCS2*)buf1) + start, end - start,
11633 buf2, len2, PY_SSIZE_T_MAX
11634 );
11635 break;
11636 case PyUnicode_4BYTE_KIND:
11637 iresult = ucs4lib_count(
11638 ((const Py_UCS4*)buf1) + start, end - start,
11639 buf2, len2, PY_SSIZE_T_MAX
11640 );
11641 break;
11642 default:
11643 Py_UNREACHABLE();
11644 }
11645
11646 result = PyLong_FromSsize_t(iresult);
11647
11648 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
11649 if (kind2 != kind1)
11650 PyMem_Free((void *)buf2);
11651
11652 return result;
11653 }
11654
11655 /*[clinic input]
11656 str.encode as unicode_encode
11657
11658 encoding: str(c_default="NULL") = 'utf-8'
11659 The encoding in which to encode the string.
11660 errors: str(c_default="NULL") = 'strict'
11661 The error handling scheme to use for encoding errors.
11662 The default is 'strict' meaning that encoding errors raise a
11663 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11664 'xmlcharrefreplace' as well as any other name registered with
11665 codecs.register_error that can handle UnicodeEncodeErrors.
11666
11667 Encode the string using the codec registered for encoding.
11668 [clinic start generated code]*/
11669
11670 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11671 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11672 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11673 {
11674 return PyUnicode_AsEncodedString(self, encoding, errors);
11675 }
11676
11677 /*[clinic input]
11678 str.expandtabs as unicode_expandtabs
11679
11680 tabsize: int = 8
11681
11682 Return a copy where all tab characters are expanded using spaces.
11683
11684 If tabsize is not given, a tab size of 8 characters is assumed.
11685 [clinic start generated code]*/
11686
11687 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11688 unicode_expandtabs_impl(PyObject *self, int tabsize)
11689 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11690 {
11691 Py_ssize_t i, j, line_pos, src_len, incr;
11692 Py_UCS4 ch;
11693 PyObject *u;
11694 const void *src_data;
11695 void *dest_data;
11696 int kind;
11697 int found;
11698
11699 if (PyUnicode_READY(self) == -1)
11700 return NULL;
11701
11702 /* First pass: determine size of output string */
11703 src_len = PyUnicode_GET_LENGTH(self);
11704 i = j = line_pos = 0;
11705 kind = PyUnicode_KIND(self);
11706 src_data = PyUnicode_DATA(self);
11707 found = 0;
11708 for (; i < src_len; i++) {
11709 ch = PyUnicode_READ(kind, src_data, i);
11710 if (ch == '\t') {
11711 found = 1;
11712 if (tabsize > 0) {
11713 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11714 if (j > PY_SSIZE_T_MAX - incr)
11715 goto overflow;
11716 line_pos += incr;
11717 j += incr;
11718 }
11719 }
11720 else {
11721 if (j > PY_SSIZE_T_MAX - 1)
11722 goto overflow;
11723 line_pos++;
11724 j++;
11725 if (ch == '\n' || ch == '\r')
11726 line_pos = 0;
11727 }
11728 }
11729 if (!found)
11730 return unicode_result_unchanged(self);
11731
11732 /* Second pass: create output string and fill it */
11733 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11734 if (!u)
11735 return NULL;
11736 dest_data = PyUnicode_DATA(u);
11737
11738 i = j = line_pos = 0;
11739
11740 for (; i < src_len; i++) {
11741 ch = PyUnicode_READ(kind, src_data, i);
11742 if (ch == '\t') {
11743 if (tabsize > 0) {
11744 incr = tabsize - (line_pos % tabsize);
11745 line_pos += incr;
11746 unicode_fill(kind, dest_data, ' ', j, incr);
11747 j += incr;
11748 }
11749 }
11750 else {
11751 line_pos++;
11752 PyUnicode_WRITE(kind, dest_data, j, ch);
11753 j++;
11754 if (ch == '\n' || ch == '\r')
11755 line_pos = 0;
11756 }
11757 }
11758 assert (j == PyUnicode_GET_LENGTH(u));
11759 return unicode_result(u);
11760
11761 overflow:
11762 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11763 return NULL;
11764 }
11765
11766 PyDoc_STRVAR(find__doc__,
11767 "S.find(sub[, start[, end]]) -> int\n\
11768 \n\
11769 Return the lowest index in S where substring sub is found,\n\
11770 such that sub is contained within S[start:end]. Optional\n\
11771 arguments start and end are interpreted as in slice notation.\n\
11772 \n\
11773 Return -1 on failure.");
11774
11775 static PyObject *
unicode_find(PyObject * self,PyObject * args)11776 unicode_find(PyObject *self, PyObject *args)
11777 {
11778 /* initialize variables to prevent gcc warning */
11779 PyObject *substring = NULL;
11780 Py_ssize_t start = 0;
11781 Py_ssize_t end = 0;
11782 Py_ssize_t result;
11783
11784 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11785 return NULL;
11786
11787 if (PyUnicode_READY(self) == -1)
11788 return NULL;
11789
11790 result = any_find_slice(self, substring, start, end, 1);
11791
11792 if (result == -2)
11793 return NULL;
11794
11795 return PyLong_FromSsize_t(result);
11796 }
11797
11798 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11799 unicode_getitem(PyObject *self, Py_ssize_t index)
11800 {
11801 const void *data;
11802 enum PyUnicode_Kind kind;
11803 Py_UCS4 ch;
11804
11805 if (!PyUnicode_Check(self)) {
11806 PyErr_BadArgument();
11807 return NULL;
11808 }
11809 if (PyUnicode_READY(self) == -1) {
11810 return NULL;
11811 }
11812 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11813 PyErr_SetString(PyExc_IndexError, "string index out of range");
11814 return NULL;
11815 }
11816 kind = PyUnicode_KIND(self);
11817 data = PyUnicode_DATA(self);
11818 ch = PyUnicode_READ(kind, data, index);
11819 return unicode_char(ch);
11820 }
11821
11822 /* Believe it or not, this produces the same value for ASCII strings
11823 as bytes_hash(). */
11824 static Py_hash_t
unicode_hash(PyObject * self)11825 unicode_hash(PyObject *self)
11826 {
11827 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11828
11829 #ifdef Py_DEBUG
11830 assert(_Py_HashSecret_Initialized);
11831 #endif
11832 if (_PyUnicode_HASH(self) != -1)
11833 return _PyUnicode_HASH(self);
11834 if (PyUnicode_READY(self) == -1)
11835 return -1;
11836
11837 x = _Py_HashBytes(PyUnicode_DATA(self),
11838 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11839 _PyUnicode_HASH(self) = x;
11840 return x;
11841 }
11842
11843 PyDoc_STRVAR(index__doc__,
11844 "S.index(sub[, start[, end]]) -> int\n\
11845 \n\
11846 Return the lowest index in S where substring sub is found,\n\
11847 such that sub is contained within S[start:end]. Optional\n\
11848 arguments start and end are interpreted as in slice notation.\n\
11849 \n\
11850 Raises ValueError when the substring is not found.");
11851
11852 static PyObject *
unicode_index(PyObject * self,PyObject * args)11853 unicode_index(PyObject *self, PyObject *args)
11854 {
11855 /* initialize variables to prevent gcc warning */
11856 Py_ssize_t result;
11857 PyObject *substring = NULL;
11858 Py_ssize_t start = 0;
11859 Py_ssize_t end = 0;
11860
11861 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11862 return NULL;
11863
11864 if (PyUnicode_READY(self) == -1)
11865 return NULL;
11866
11867 result = any_find_slice(self, substring, start, end, 1);
11868
11869 if (result == -2)
11870 return NULL;
11871
11872 if (result < 0) {
11873 PyErr_SetString(PyExc_ValueError, "substring not found");
11874 return NULL;
11875 }
11876
11877 return PyLong_FromSsize_t(result);
11878 }
11879
11880 /*[clinic input]
11881 str.isascii as unicode_isascii
11882
11883 Return True if all characters in the string are ASCII, False otherwise.
11884
11885 ASCII characters have code points in the range U+0000-U+007F.
11886 Empty string is ASCII too.
11887 [clinic start generated code]*/
11888
11889 static PyObject *
unicode_isascii_impl(PyObject * self)11890 unicode_isascii_impl(PyObject *self)
11891 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11892 {
11893 if (PyUnicode_READY(self) == -1) {
11894 return NULL;
11895 }
11896 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11897 }
11898
11899 /*[clinic input]
11900 str.islower as unicode_islower
11901
11902 Return True if the string is a lowercase string, False otherwise.
11903
11904 A string is lowercase if all cased characters in the string are lowercase and
11905 there is at least one cased character in the string.
11906 [clinic start generated code]*/
11907
11908 static PyObject *
unicode_islower_impl(PyObject * self)11909 unicode_islower_impl(PyObject *self)
11910 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11911 {
11912 Py_ssize_t i, length;
11913 int kind;
11914 const void *data;
11915 int cased;
11916
11917 if (PyUnicode_READY(self) == -1)
11918 return NULL;
11919 length = PyUnicode_GET_LENGTH(self);
11920 kind = PyUnicode_KIND(self);
11921 data = PyUnicode_DATA(self);
11922
11923 /* Shortcut for single character strings */
11924 if (length == 1)
11925 return PyBool_FromLong(
11926 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11927
11928 /* Special case for empty strings */
11929 if (length == 0)
11930 Py_RETURN_FALSE;
11931
11932 cased = 0;
11933 for (i = 0; i < length; i++) {
11934 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11935
11936 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11937 Py_RETURN_FALSE;
11938 else if (!cased && Py_UNICODE_ISLOWER(ch))
11939 cased = 1;
11940 }
11941 return PyBool_FromLong(cased);
11942 }
11943
11944 /*[clinic input]
11945 str.isupper as unicode_isupper
11946
11947 Return True if the string is an uppercase string, False otherwise.
11948
11949 A string is uppercase if all cased characters in the string are uppercase and
11950 there is at least one cased character in the string.
11951 [clinic start generated code]*/
11952
11953 static PyObject *
unicode_isupper_impl(PyObject * self)11954 unicode_isupper_impl(PyObject *self)
11955 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11956 {
11957 Py_ssize_t i, length;
11958 int kind;
11959 const void *data;
11960 int cased;
11961
11962 if (PyUnicode_READY(self) == -1)
11963 return NULL;
11964 length = PyUnicode_GET_LENGTH(self);
11965 kind = PyUnicode_KIND(self);
11966 data = PyUnicode_DATA(self);
11967
11968 /* Shortcut for single character strings */
11969 if (length == 1)
11970 return PyBool_FromLong(
11971 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11972
11973 /* Special case for empty strings */
11974 if (length == 0)
11975 Py_RETURN_FALSE;
11976
11977 cased = 0;
11978 for (i = 0; i < length; i++) {
11979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11980
11981 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11982 Py_RETURN_FALSE;
11983 else if (!cased && Py_UNICODE_ISUPPER(ch))
11984 cased = 1;
11985 }
11986 return PyBool_FromLong(cased);
11987 }
11988
11989 /*[clinic input]
11990 str.istitle as unicode_istitle
11991
11992 Return True if the string is a title-cased string, False otherwise.
11993
11994 In a title-cased string, upper- and title-case characters may only
11995 follow uncased characters and lowercase characters only cased ones.
11996 [clinic start generated code]*/
11997
11998 static PyObject *
unicode_istitle_impl(PyObject * self)11999 unicode_istitle_impl(PyObject *self)
12000 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
12001 {
12002 Py_ssize_t i, length;
12003 int kind;
12004 const void *data;
12005 int cased, previous_is_cased;
12006
12007 if (PyUnicode_READY(self) == -1)
12008 return NULL;
12009 length = PyUnicode_GET_LENGTH(self);
12010 kind = PyUnicode_KIND(self);
12011 data = PyUnicode_DATA(self);
12012
12013 /* Shortcut for single character strings */
12014 if (length == 1) {
12015 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12016 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12017 (Py_UNICODE_ISUPPER(ch) != 0));
12018 }
12019
12020 /* Special case for empty strings */
12021 if (length == 0)
12022 Py_RETURN_FALSE;
12023
12024 cased = 0;
12025 previous_is_cased = 0;
12026 for (i = 0; i < length; i++) {
12027 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12028
12029 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12030 if (previous_is_cased)
12031 Py_RETURN_FALSE;
12032 previous_is_cased = 1;
12033 cased = 1;
12034 }
12035 else if (Py_UNICODE_ISLOWER(ch)) {
12036 if (!previous_is_cased)
12037 Py_RETURN_FALSE;
12038 previous_is_cased = 1;
12039 cased = 1;
12040 }
12041 else
12042 previous_is_cased = 0;
12043 }
12044 return PyBool_FromLong(cased);
12045 }
12046
12047 /*[clinic input]
12048 str.isspace as unicode_isspace
12049
12050 Return True if the string is a whitespace string, False otherwise.
12051
12052 A string is whitespace if all characters in the string are whitespace and there
12053 is at least one character in the string.
12054 [clinic start generated code]*/
12055
12056 static PyObject *
unicode_isspace_impl(PyObject * self)12057 unicode_isspace_impl(PyObject *self)
12058 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
12059 {
12060 Py_ssize_t i, length;
12061 int kind;
12062 const void *data;
12063
12064 if (PyUnicode_READY(self) == -1)
12065 return NULL;
12066 length = PyUnicode_GET_LENGTH(self);
12067 kind = PyUnicode_KIND(self);
12068 data = PyUnicode_DATA(self);
12069
12070 /* Shortcut for single character strings */
12071 if (length == 1)
12072 return PyBool_FromLong(
12073 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
12074
12075 /* Special case for empty strings */
12076 if (length == 0)
12077 Py_RETURN_FALSE;
12078
12079 for (i = 0; i < length; i++) {
12080 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12081 if (!Py_UNICODE_ISSPACE(ch))
12082 Py_RETURN_FALSE;
12083 }
12084 Py_RETURN_TRUE;
12085 }
12086
12087 /*[clinic input]
12088 str.isalpha as unicode_isalpha
12089
12090 Return True if the string is an alphabetic string, False otherwise.
12091
12092 A string is alphabetic if all characters in the string are alphabetic and there
12093 is at least one character in the string.
12094 [clinic start generated code]*/
12095
12096 static PyObject *
unicode_isalpha_impl(PyObject * self)12097 unicode_isalpha_impl(PyObject *self)
12098 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
12099 {
12100 Py_ssize_t i, length;
12101 int kind;
12102 const void *data;
12103
12104 if (PyUnicode_READY(self) == -1)
12105 return NULL;
12106 length = PyUnicode_GET_LENGTH(self);
12107 kind = PyUnicode_KIND(self);
12108 data = PyUnicode_DATA(self);
12109
12110 /* Shortcut for single character strings */
12111 if (length == 1)
12112 return PyBool_FromLong(
12113 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
12114
12115 /* Special case for empty strings */
12116 if (length == 0)
12117 Py_RETURN_FALSE;
12118
12119 for (i = 0; i < length; i++) {
12120 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
12121 Py_RETURN_FALSE;
12122 }
12123 Py_RETURN_TRUE;
12124 }
12125
12126 /*[clinic input]
12127 str.isalnum as unicode_isalnum
12128
12129 Return True if the string is an alpha-numeric string, False otherwise.
12130
12131 A string is alpha-numeric if all characters in the string are alpha-numeric and
12132 there is at least one character in the string.
12133 [clinic start generated code]*/
12134
12135 static PyObject *
unicode_isalnum_impl(PyObject * self)12136 unicode_isalnum_impl(PyObject *self)
12137 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
12138 {
12139 int kind;
12140 const void *data;
12141 Py_ssize_t len, i;
12142
12143 if (PyUnicode_READY(self) == -1)
12144 return NULL;
12145
12146 kind = PyUnicode_KIND(self);
12147 data = PyUnicode_DATA(self);
12148 len = PyUnicode_GET_LENGTH(self);
12149
12150 /* Shortcut for single character strings */
12151 if (len == 1) {
12152 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12153 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12154 }
12155
12156 /* Special case for empty strings */
12157 if (len == 0)
12158 Py_RETURN_FALSE;
12159
12160 for (i = 0; i < len; i++) {
12161 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12162 if (!Py_UNICODE_ISALNUM(ch))
12163 Py_RETURN_FALSE;
12164 }
12165 Py_RETURN_TRUE;
12166 }
12167
12168 /*[clinic input]
12169 str.isdecimal as unicode_isdecimal
12170
12171 Return True if the string is a decimal string, False otherwise.
12172
12173 A string is a decimal string if all characters in the string are decimal and
12174 there is at least one character in the string.
12175 [clinic start generated code]*/
12176
12177 static PyObject *
unicode_isdecimal_impl(PyObject * self)12178 unicode_isdecimal_impl(PyObject *self)
12179 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12180 {
12181 Py_ssize_t i, length;
12182 int kind;
12183 const void *data;
12184
12185 if (PyUnicode_READY(self) == -1)
12186 return NULL;
12187 length = PyUnicode_GET_LENGTH(self);
12188 kind = PyUnicode_KIND(self);
12189 data = PyUnicode_DATA(self);
12190
12191 /* Shortcut for single character strings */
12192 if (length == 1)
12193 return PyBool_FromLong(
12194 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12195
12196 /* Special case for empty strings */
12197 if (length == 0)
12198 Py_RETURN_FALSE;
12199
12200 for (i = 0; i < length; i++) {
12201 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12202 Py_RETURN_FALSE;
12203 }
12204 Py_RETURN_TRUE;
12205 }
12206
12207 /*[clinic input]
12208 str.isdigit as unicode_isdigit
12209
12210 Return True if the string is a digit string, False otherwise.
12211
12212 A string is a digit string if all characters in the string are digits and there
12213 is at least one character in the string.
12214 [clinic start generated code]*/
12215
12216 static PyObject *
unicode_isdigit_impl(PyObject * self)12217 unicode_isdigit_impl(PyObject *self)
12218 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12219 {
12220 Py_ssize_t i, length;
12221 int kind;
12222 const void *data;
12223
12224 if (PyUnicode_READY(self) == -1)
12225 return NULL;
12226 length = PyUnicode_GET_LENGTH(self);
12227 kind = PyUnicode_KIND(self);
12228 data = PyUnicode_DATA(self);
12229
12230 /* Shortcut for single character strings */
12231 if (length == 1) {
12232 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12233 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12234 }
12235
12236 /* Special case for empty strings */
12237 if (length == 0)
12238 Py_RETURN_FALSE;
12239
12240 for (i = 0; i < length; i++) {
12241 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12242 Py_RETURN_FALSE;
12243 }
12244 Py_RETURN_TRUE;
12245 }
12246
12247 /*[clinic input]
12248 str.isnumeric as unicode_isnumeric
12249
12250 Return True if the string is a numeric string, False otherwise.
12251
12252 A string is numeric if all characters in the string are numeric and there is at
12253 least one character in the string.
12254 [clinic start generated code]*/
12255
12256 static PyObject *
unicode_isnumeric_impl(PyObject * self)12257 unicode_isnumeric_impl(PyObject *self)
12258 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12259 {
12260 Py_ssize_t i, length;
12261 int kind;
12262 const void *data;
12263
12264 if (PyUnicode_READY(self) == -1)
12265 return NULL;
12266 length = PyUnicode_GET_LENGTH(self);
12267 kind = PyUnicode_KIND(self);
12268 data = PyUnicode_DATA(self);
12269
12270 /* Shortcut for single character strings */
12271 if (length == 1)
12272 return PyBool_FromLong(
12273 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12274
12275 /* Special case for empty strings */
12276 if (length == 0)
12277 Py_RETURN_FALSE;
12278
12279 for (i = 0; i < length; i++) {
12280 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12281 Py_RETURN_FALSE;
12282 }
12283 Py_RETURN_TRUE;
12284 }
12285
12286 Py_ssize_t
_PyUnicode_ScanIdentifier(PyObject * self)12287 _PyUnicode_ScanIdentifier(PyObject *self)
12288 {
12289 Py_ssize_t i;
12290 if (PyUnicode_READY(self) == -1)
12291 return -1;
12292
12293 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12294 if (len == 0) {
12295 /* an empty string is not a valid identifier */
12296 return 0;
12297 }
12298
12299 int kind = PyUnicode_KIND(self);
12300 const void *data = PyUnicode_DATA(self);
12301 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12302 /* PEP 3131 says that the first character must be in
12303 XID_Start and subsequent characters in XID_Continue,
12304 and for the ASCII range, the 2.x rules apply (i.e
12305 start with letters and underscore, continue with
12306 letters, digits, underscore). However, given the current
12307 definition of XID_Start and XID_Continue, it is sufficient
12308 to check just for these, except that _ must be allowed
12309 as starting an identifier. */
12310 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12311 return 0;
12312 }
12313
12314 for (i = 1; i < len; i++) {
12315 ch = PyUnicode_READ(kind, data, i);
12316 if (!_PyUnicode_IsXidContinue(ch)) {
12317 return i;
12318 }
12319 }
12320 return i;
12321 }
12322
/* Return 1 if the string is a valid identifier, 0 otherwise.

   A ready string is checked with _PyUnicode_ScanIdentifier(): it is an
   identifier when it is non-empty and the scan consumed every character.
   A string that is not ready is checked directly on its wchar_t buffer,
   using the same rule: the first code point must satisfy
   _PyUnicode_IsXidStart() or be '_', every later code point must satisfy
   _PyUnicode_IsXidContinue(). */
int
PyUnicode_IsIdentifier(PyObject *self)
{
    if (PyUnicode_IS_READY(self)) {
        Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
        Py_ssize_t len = PyUnicode_GET_LENGTH(self);
        /* an empty string is not a valid identifier */
        return len && i == len;
    }
    else {
/* NOTE(review): the DIAG macros presumably silence deprecation warnings
   for PyUnicode_GET_SIZE() used below -- confirm against pyport.h. */
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
        Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
        if (len == 0) {
            /* an empty string is not a valid identifier */
            return 0;
        }

        const wchar_t *wstr = _PyUnicode_WSTR(self);
        Py_UCS4 ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
        /* With 2-byte wchar_t, a high surrogate followed by a low
           surrogate is joined into a single code point. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
            && i < len
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
            i++;
        }
#endif
        /* First code point: XID_Start or underscore. */
        if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
            return 0;
        }

        /* Remaining code points: XID_Continue. */
        while (i < len) {
            ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
            if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
                && i < len
                && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
                i++;
            }
#endif
            if (!_PyUnicode_IsXidContinue(ch)) {
                return 0;
            }
        }
        return 1;
_Py_COMP_DIAG_POP
    }
}
12375
12376 /*[clinic input]
12377 str.isidentifier as unicode_isidentifier
12378
12379 Return True if the string is a valid Python identifier, False otherwise.
12380
12381 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12382 such as "def" or "class".
12383 [clinic start generated code]*/
12384
12385 static PyObject *
unicode_isidentifier_impl(PyObject * self)12386 unicode_isidentifier_impl(PyObject *self)
12387 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12388 {
12389 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12390 }
12391
12392 /*[clinic input]
12393 str.isprintable as unicode_isprintable
12394
12395 Return True if the string is printable, False otherwise.
12396
12397 A string is printable if all of its characters are considered printable in
12398 repr() or if it is empty.
12399 [clinic start generated code]*/
12400
12401 static PyObject *
unicode_isprintable_impl(PyObject * self)12402 unicode_isprintable_impl(PyObject *self)
12403 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12404 {
12405 Py_ssize_t i, length;
12406 int kind;
12407 const void *data;
12408
12409 if (PyUnicode_READY(self) == -1)
12410 return NULL;
12411 length = PyUnicode_GET_LENGTH(self);
12412 kind = PyUnicode_KIND(self);
12413 data = PyUnicode_DATA(self);
12414
12415 /* Shortcut for single character strings */
12416 if (length == 1)
12417 return PyBool_FromLong(
12418 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12419
12420 for (i = 0; i < length; i++) {
12421 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12422 Py_RETURN_FALSE;
12423 }
12424 }
12425 Py_RETURN_TRUE;
12426 }
12427
12428 /*[clinic input]
12429 str.join as unicode_join
12430
12431 iterable: object
12432 /
12433
12434 Concatenate any number of strings.
12435
12436 The string whose method is called is inserted in between each given string.
12437 The result is returned as a new string.
12438
12439 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12440 [clinic start generated code]*/
12441
12442 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12443 unicode_join(PyObject *self, PyObject *iterable)
12444 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12445 {
12446 return PyUnicode_Join(self, iterable);
12447 }
12448
12449 static Py_ssize_t
unicode_length(PyObject * self)12450 unicode_length(PyObject *self)
12451 {
12452 if (PyUnicode_READY(self) == -1)
12453 return -1;
12454 return PyUnicode_GET_LENGTH(self);
12455 }
12456
12457 /*[clinic input]
12458 str.ljust as unicode_ljust
12459
12460 width: Py_ssize_t
12461 fillchar: Py_UCS4 = ' '
12462 /
12463
12464 Return a left-justified string of length width.
12465
12466 Padding is done using the specified fill character (default is a space).
12467 [clinic start generated code]*/
12468
12469 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12470 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12471 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12472 {
12473 if (PyUnicode_READY(self) == -1)
12474 return NULL;
12475
12476 if (PyUnicode_GET_LENGTH(self) >= width)
12477 return unicode_result_unchanged(self);
12478
12479 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12480 }
12481
12482 /*[clinic input]
12483 str.lower as unicode_lower
12484
12485 Return a copy of the string converted to lowercase.
12486 [clinic start generated code]*/
12487
12488 static PyObject *
unicode_lower_impl(PyObject * self)12489 unicode_lower_impl(PyObject *self)
12490 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12491 {
12492 if (PyUnicode_READY(self) == -1)
12493 return NULL;
12494 if (PyUnicode_IS_ASCII(self))
12495 return ascii_upper_or_lower(self, 1);
12496 return case_operation(self, do_lower);
12497 }
12498
/* Strip modes passed as `striptype` to the strip helpers.  The values are
   also used as indices into stripfuncnames, so the two must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode `i`; used when formatting error messages. */
#define STRIPNAME(i) (stripfuncnames[i])
12507
12508 /* externally visible for str.strip(unicode) */
12509 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12510 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12511 {
12512 const void *data;
12513 int kind;
12514 Py_ssize_t i, j, len;
12515 BLOOM_MASK sepmask;
12516 Py_ssize_t seplen;
12517
12518 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12519 return NULL;
12520
12521 kind = PyUnicode_KIND(self);
12522 data = PyUnicode_DATA(self);
12523 len = PyUnicode_GET_LENGTH(self);
12524 seplen = PyUnicode_GET_LENGTH(sepobj);
12525 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12526 PyUnicode_DATA(sepobj),
12527 seplen);
12528
12529 i = 0;
12530 if (striptype != RIGHTSTRIP) {
12531 while (i < len) {
12532 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12533 if (!BLOOM(sepmask, ch))
12534 break;
12535 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12536 break;
12537 i++;
12538 }
12539 }
12540
12541 j = len;
12542 if (striptype != LEFTSTRIP) {
12543 j--;
12544 while (j >= i) {
12545 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12546 if (!BLOOM(sepmask, ch))
12547 break;
12548 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12549 break;
12550 j--;
12551 }
12552
12553 j++;
12554 }
12555
12556 return PyUnicode_Substring(self, i, j);
12557 }
12558
12559 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12560 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12561 {
12562 const unsigned char *data;
12563 int kind;
12564 Py_ssize_t length;
12565
12566 if (PyUnicode_READY(self) == -1)
12567 return NULL;
12568
12569 length = PyUnicode_GET_LENGTH(self);
12570 end = Py_MIN(end, length);
12571
12572 if (start == 0 && end == length)
12573 return unicode_result_unchanged(self);
12574
12575 if (start < 0 || end < 0) {
12576 PyErr_SetString(PyExc_IndexError, "string index out of range");
12577 return NULL;
12578 }
12579 if (start >= length || end < start)
12580 _Py_RETURN_UNICODE_EMPTY();
12581
12582 length = end - start;
12583 if (PyUnicode_IS_ASCII(self)) {
12584 data = PyUnicode_1BYTE_DATA(self);
12585 return _PyUnicode_FromASCII((const char*)(data + start), length);
12586 }
12587 else {
12588 kind = PyUnicode_KIND(self);
12589 data = PyUnicode_1BYTE_DATA(self);
12590 return PyUnicode_FromKindAndData(kind,
12591 data + kind * start,
12592 length);
12593 }
12594 }
12595
12596 static PyObject *
do_strip(PyObject * self,int striptype)12597 do_strip(PyObject *self, int striptype)
12598 {
12599 Py_ssize_t len, i, j;
12600
12601 if (PyUnicode_READY(self) == -1)
12602 return NULL;
12603
12604 len = PyUnicode_GET_LENGTH(self);
12605
12606 if (PyUnicode_IS_ASCII(self)) {
12607 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12608
12609 i = 0;
12610 if (striptype != RIGHTSTRIP) {
12611 while (i < len) {
12612 Py_UCS1 ch = data[i];
12613 if (!_Py_ascii_whitespace[ch])
12614 break;
12615 i++;
12616 }
12617 }
12618
12619 j = len;
12620 if (striptype != LEFTSTRIP) {
12621 j--;
12622 while (j >= i) {
12623 Py_UCS1 ch = data[j];
12624 if (!_Py_ascii_whitespace[ch])
12625 break;
12626 j--;
12627 }
12628 j++;
12629 }
12630 }
12631 else {
12632 int kind = PyUnicode_KIND(self);
12633 const void *data = PyUnicode_DATA(self);
12634
12635 i = 0;
12636 if (striptype != RIGHTSTRIP) {
12637 while (i < len) {
12638 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12639 if (!Py_UNICODE_ISSPACE(ch))
12640 break;
12641 i++;
12642 }
12643 }
12644
12645 j = len;
12646 if (striptype != LEFTSTRIP) {
12647 j--;
12648 while (j >= i) {
12649 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12650 if (!Py_UNICODE_ISSPACE(ch))
12651 break;
12652 j--;
12653 }
12654 j++;
12655 }
12656 }
12657
12658 return PyUnicode_Substring(self, i, j);
12659 }
12660
12661
12662 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12663 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12664 {
12665 if (sep != Py_None) {
12666 if (PyUnicode_Check(sep))
12667 return _PyUnicode_XStrip(self, striptype, sep);
12668 else {
12669 PyErr_Format(PyExc_TypeError,
12670 "%s arg must be None or str",
12671 STRIPNAME(striptype));
12672 return NULL;
12673 }
12674 }
12675
12676 return do_strip(self, striptype);
12677 }
12678
12679
12680 /*[clinic input]
12681 str.strip as unicode_strip
12682
12683 chars: object = None
12684 /
12685
12686 Return a copy of the string with leading and trailing whitespace removed.
12687
12688 If chars is given and not None, remove characters in chars instead.
12689 [clinic start generated code]*/
12690
12691 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12692 unicode_strip_impl(PyObject *self, PyObject *chars)
12693 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12694 {
12695 return do_argstrip(self, BOTHSTRIP, chars);
12696 }
12697
12698
12699 /*[clinic input]
12700 str.lstrip as unicode_lstrip
12701
12702 chars: object = None
12703 /
12704
12705 Return a copy of the string with leading whitespace removed.
12706
12707 If chars is given and not None, remove characters in chars instead.
12708 [clinic start generated code]*/
12709
12710 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12711 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12712 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12713 {
12714 return do_argstrip(self, LEFTSTRIP, chars);
12715 }
12716
12717
12718 /*[clinic input]
12719 str.rstrip as unicode_rstrip
12720
12721 chars: object = None
12722 /
12723
12724 Return a copy of the string with trailing whitespace removed.
12725
12726 If chars is given and not None, remove characters in chars instead.
12727 [clinic start generated code]*/
12728
12729 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12730 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12731 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12732 {
12733 return do_argstrip(self, RIGHTSTRIP, chars);
12734 }
12735
12736
12737 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12738 unicode_repeat(PyObject *str, Py_ssize_t len)
12739 {
12740 PyObject *u;
12741 Py_ssize_t nchars, n;
12742
12743 if (len < 1)
12744 _Py_RETURN_UNICODE_EMPTY();
12745
12746 /* no repeat, return original string */
12747 if (len == 1)
12748 return unicode_result_unchanged(str);
12749
12750 if (PyUnicode_READY(str) == -1)
12751 return NULL;
12752
12753 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12754 PyErr_SetString(PyExc_OverflowError,
12755 "repeated string is too long");
12756 return NULL;
12757 }
12758 nchars = len * PyUnicode_GET_LENGTH(str);
12759
12760 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12761 if (!u)
12762 return NULL;
12763 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12764
12765 if (PyUnicode_GET_LENGTH(str) == 1) {
12766 int kind = PyUnicode_KIND(str);
12767 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12768 if (kind == PyUnicode_1BYTE_KIND) {
12769 void *to = PyUnicode_DATA(u);
12770 memset(to, (unsigned char)fill_char, len);
12771 }
12772 else if (kind == PyUnicode_2BYTE_KIND) {
12773 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12774 for (n = 0; n < len; ++n)
12775 ucs2[n] = fill_char;
12776 } else {
12777 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12778 assert(kind == PyUnicode_4BYTE_KIND);
12779 for (n = 0; n < len; ++n)
12780 ucs4[n] = fill_char;
12781 }
12782 }
12783 else {
12784 Py_ssize_t char_size = PyUnicode_KIND(str);
12785 char *to = (char *) PyUnicode_DATA(u);
12786 _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12787 PyUnicode_GET_LENGTH(str) * char_size);
12788 }
12789
12790 assert(_PyUnicode_CheckConsistency(u, 1));
12791 return u;
12792 }
12793
12794 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12795 PyUnicode_Replace(PyObject *str,
12796 PyObject *substr,
12797 PyObject *replstr,
12798 Py_ssize_t maxcount)
12799 {
12800 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12801 ensure_unicode(replstr) < 0)
12802 return NULL;
12803 return replace(str, substr, replstr, maxcount);
12804 }
12805
12806 /*[clinic input]
12807 str.replace as unicode_replace
12808
12809 old: unicode
12810 new: unicode
12811 count: Py_ssize_t = -1
12812 Maximum number of occurrences to replace.
12813 -1 (the default value) means replace all occurrences.
12814 /
12815
12816 Return a copy with all occurrences of substring old replaced by new.
12817
12818 If the optional argument count is given, only the first count occurrences are
12819 replaced.
12820 [clinic start generated code]*/
12821
12822 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12823 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12824 Py_ssize_t count)
12825 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12826 {
12827 if (PyUnicode_READY(self) == -1)
12828 return NULL;
12829 return replace(self, old, new, count);
12830 }
12831
12832 /*[clinic input]
12833 str.removeprefix as unicode_removeprefix
12834
12835 prefix: unicode
12836 /
12837
12838 Return a str with the given prefix string removed if present.
12839
12840 If the string starts with the prefix string, return string[len(prefix):].
12841 Otherwise, return a copy of the original string.
12842 [clinic start generated code]*/
12843
12844 static PyObject *
unicode_removeprefix_impl(PyObject * self,PyObject * prefix)12845 unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12846 /*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
12847 {
12848 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12849 if (match == -1) {
12850 return NULL;
12851 }
12852 if (match) {
12853 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12854 PyUnicode_GET_LENGTH(self));
12855 }
12856 return unicode_result_unchanged(self);
12857 }
12858
12859 /*[clinic input]
12860 str.removesuffix as unicode_removesuffix
12861
12862 suffix: unicode
12863 /
12864
12865 Return a str with the given suffix string removed if present.
12866
12867 If the string ends with the suffix string and that suffix is not empty,
12868 return string[:-len(suffix)]. Otherwise, return a copy of the original
12869 string.
12870 [clinic start generated code]*/
12871
12872 static PyObject *
unicode_removesuffix_impl(PyObject * self,PyObject * suffix)12873 unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12874 /*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12875 {
12876 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12877 if (match == -1) {
12878 return NULL;
12879 }
12880 if (match) {
12881 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12882 - PyUnicode_GET_LENGTH(suffix));
12883 }
12884 return unicode_result_unchanged(self);
12885 }
12886
/* Build the repr() of a string: the text wrapped in quotes, with quotes,
   backslashes and non-printable characters escaped.

   Works in two passes.  The first pass computes the exact output length,
   counts single and double quotes, and tracks the largest character that
   will be copied verbatim so the result can be allocated once.  The
   second pass writes the characters.  The escapes emitted by the second
   pass must stay in sync with the sizes counted by the first.

   Returns a new string, or NULL with an exception set (string not ready,
   output length overflow, or allocation failure). */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind, unchanged;
    const void *idata;
    void *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 0;
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;    /* output characters needed for ch */
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                max = ch > max ? ch : max;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        /* Guard the running total against Py_ssize_t overflow. */
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    /* Default to single quotes; switch to double quotes when the string
       contains single quotes but no double quotes. */
    quote = '\'';
    /* If nothing needs escaping the input can be copied in one go. */
    unchanged = (osize == isize);
    if (squote) {
        unchanged = 0;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }
    osize += 2;   /* quotes */

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Write both the opening and the closing quote up front. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);
    if (unchanged) {
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
    }
    else {
        /* o is the next output index; it starts after the opening quote. */
        for (i = 0, o = 1; i < isize; i++) {
            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

            /* Escape quotes and backslashes */
            if ((ch == quote) || (ch == '\\')) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }

            /* Map special whitespace to '\t', '\n', '\r' */
            if (ch == '\t') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 't');
            }
            else if (ch == '\n') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'n');
            }
            else if (ch == '\r') {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'r');
            }

            /* Map non-printable US ASCII to '\xhh' */
            else if (ch < ' ' || ch == 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, 'x');
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
            }

            /* Copy ASCII characters as-is */
            else if (ch < 0x7F) {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }

            /* Non-ASCII characters */
            else {
                /* Map Unicode whitespace and control characters
                   (categories Z* and C* except ASCII space)
                */
                if (!Py_UNICODE_ISPRINTABLE(ch)) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    /* Map 8-bit characters to '\xhh' */
                    if (ch <= 0xff) {
                        PyUnicode_WRITE(okind, odata, o++, 'x');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
                    }
                    /* Map 16-bit characters to '\uxxxx' */
                    else if (ch <= 0xffff) {
                        PyUnicode_WRITE(okind, odata, o++, 'u');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                    /* Map 21-bit characters to '\U00xxxxxx' */
                    else {
                        PyUnicode_WRITE(okind, odata, o++, 'U');
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
                    }
                }
                /* Copy characters as-is */
                else {
                    PyUnicode_WRITE(okind, odata, o++, ch);
                }
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
13051
13052 PyDoc_STRVAR(rfind__doc__,
13053 "S.rfind(sub[, start[, end]]) -> int\n\
13054 \n\
13055 Return the highest index in S where substring sub is found,\n\
13056 such that sub is contained within S[start:end]. Optional\n\
13057 arguments start and end are interpreted as in slice notation.\n\
13058 \n\
13059 Return -1 on failure.");
13060
13061 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)13062 unicode_rfind(PyObject *self, PyObject *args)
13063 {
13064 /* initialize variables to prevent gcc warning */
13065 PyObject *substring = NULL;
13066 Py_ssize_t start = 0;
13067 Py_ssize_t end = 0;
13068 Py_ssize_t result;
13069
13070 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
13071 return NULL;
13072
13073 if (PyUnicode_READY(self) == -1)
13074 return NULL;
13075
13076 result = any_find_slice(self, substring, start, end, -1);
13077
13078 if (result == -2)
13079 return NULL;
13080
13081 return PyLong_FromSsize_t(result);
13082 }
13083
/* Docstring text for the str.rindex() method. */
PyDoc_STRVAR(rindex__doc__,
"S.rindex(sub[, start[, end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Raises ValueError when the substring is not found.");
13092
13093 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)13094 unicode_rindex(PyObject *self, PyObject *args)
13095 {
13096 /* initialize variables to prevent gcc warning */
13097 PyObject *substring = NULL;
13098 Py_ssize_t start = 0;
13099 Py_ssize_t end = 0;
13100 Py_ssize_t result;
13101
13102 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
13103 return NULL;
13104
13105 if (PyUnicode_READY(self) == -1)
13106 return NULL;
13107
13108 result = any_find_slice(self, substring, start, end, -1);
13109
13110 if (result == -2)
13111 return NULL;
13112
13113 if (result < 0) {
13114 PyErr_SetString(PyExc_ValueError, "substring not found");
13115 return NULL;
13116 }
13117
13118 return PyLong_FromSsize_t(result);
13119 }
13120
13121 /*[clinic input]
13122 str.rjust as unicode_rjust
13123
13124 width: Py_ssize_t
13125 fillchar: Py_UCS4 = ' '
13126 /
13127
13128 Return a right-justified string of length width.
13129
13130 Padding is done using the specified fill character (default is a space).
13131 [clinic start generated code]*/
13132
13133 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)13134 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13135 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
13136 {
13137 if (PyUnicode_READY(self) == -1)
13138 return NULL;
13139
13140 if (PyUnicode_GET_LENGTH(self) >= width)
13141 return unicode_result_unchanged(self);
13142
13143 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
13144 }
13145
13146 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13147 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13148 {
13149 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13150 return NULL;
13151
13152 return split(s, sep, maxsplit);
13153 }
13154
13155 /*[clinic input]
13156 str.split as unicode_split
13157
13158 sep: object = None
13159 The separator used to split the string.
13160
13161 When set to None (the default value), will split on any whitespace
13162 character (including \\n \\r \\t \\f and spaces) and will discard
13163 empty strings from the result.
13164 maxsplit: Py_ssize_t = -1
13165 Maximum number of splits (starting from the left).
13166 -1 (the default value) means no limit.
13167
13168 Return a list of the substrings in the string, using sep as the separator string.
13169
13170 Note, str.split() is mainly useful for data that has been intentionally
13171 delimited. With natural text that includes punctuation, consider using
13172 the regular expression module.
13173
13174 [clinic start generated code]*/
13175
13176 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13177 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13178 /*[clinic end generated code: output=3a65b1db356948dc input=906d953b44efc43b]*/
13179 {
13180 if (sep == Py_None)
13181 return split(self, NULL, maxsplit);
13182 if (PyUnicode_Check(sep))
13183 return split(self, sep, maxsplit);
13184
13185 PyErr_Format(PyExc_TypeError,
13186 "must be str or None, not %.100s",
13187 Py_TYPE(sep)->tp_name);
13188 return NULL;
13189 }
13190
13191 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)13192 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
13193 {
13194 PyObject* out;
13195 int kind1, kind2;
13196 const void *buf1, *buf2;
13197 Py_ssize_t len1, len2;
13198
13199 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13200 return NULL;
13201
13202 kind1 = PyUnicode_KIND(str_obj);
13203 kind2 = PyUnicode_KIND(sep_obj);
13204 len1 = PyUnicode_GET_LENGTH(str_obj);
13205 len2 = PyUnicode_GET_LENGTH(sep_obj);
13206 if (kind1 < kind2 || len1 < len2) {
13207 PyObject *empty = unicode_get_empty(); // Borrowed reference
13208 return PyTuple_Pack(3, str_obj, empty, empty);
13209 }
13210 buf1 = PyUnicode_DATA(str_obj);
13211 buf2 = PyUnicode_DATA(sep_obj);
13212 if (kind2 != kind1) {
13213 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13214 if (!buf2)
13215 return NULL;
13216 }
13217
13218 switch (kind1) {
13219 case PyUnicode_1BYTE_KIND:
13220 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13221 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13222 else
13223 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13224 break;
13225 case PyUnicode_2BYTE_KIND:
13226 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13227 break;
13228 case PyUnicode_4BYTE_KIND:
13229 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13230 break;
13231 default:
13232 Py_UNREACHABLE();
13233 }
13234
13235 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13236 if (kind2 != kind1)
13237 PyMem_Free((void *)buf2);
13238
13239 return out;
13240 }
13241
13242
13243 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)13244 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
13245 {
13246 PyObject* out;
13247 int kind1, kind2;
13248 const void *buf1, *buf2;
13249 Py_ssize_t len1, len2;
13250
13251 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
13252 return NULL;
13253
13254 kind1 = PyUnicode_KIND(str_obj);
13255 kind2 = PyUnicode_KIND(sep_obj);
13256 len1 = PyUnicode_GET_LENGTH(str_obj);
13257 len2 = PyUnicode_GET_LENGTH(sep_obj);
13258 if (kind1 < kind2 || len1 < len2) {
13259 PyObject *empty = unicode_get_empty(); // Borrowed reference
13260 return PyTuple_Pack(3, empty, empty, str_obj);
13261 }
13262 buf1 = PyUnicode_DATA(str_obj);
13263 buf2 = PyUnicode_DATA(sep_obj);
13264 if (kind2 != kind1) {
13265 buf2 = unicode_askind(kind2, buf2, len2, kind1);
13266 if (!buf2)
13267 return NULL;
13268 }
13269
13270 switch (kind1) {
13271 case PyUnicode_1BYTE_KIND:
13272 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13273 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13274 else
13275 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13276 break;
13277 case PyUnicode_2BYTE_KIND:
13278 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13279 break;
13280 case PyUnicode_4BYTE_KIND:
13281 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13282 break;
13283 default:
13284 Py_UNREACHABLE();
13285 }
13286
13287 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
13288 if (kind2 != kind1)
13289 PyMem_Free((void *)buf2);
13290
13291 return out;
13292 }
13293
13294 /*[clinic input]
13295 str.partition as unicode_partition
13296
13297 sep: object
13298 /
13299
13300 Partition the string into three parts using the given separator.
13301
13302 This will search for the separator in the string. If the separator is found,
13303 returns a 3-tuple containing the part before the separator, the separator
13304 itself, and the part after it.
13305
13306 If the separator is not found, returns a 3-tuple containing the original string
13307 and two empty strings.
13308 [clinic start generated code]*/
13309
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
{
    /* Method wrapper for str.partition(): everything, including the
       type check of `sep`, is done by PyUnicode_Partition(). */
    return PyUnicode_Partition(self, sep);
}
13316
13317 /*[clinic input]
13318 str.rpartition as unicode_rpartition = str.partition
13319
13320 Partition the string into three parts using the given separator.
13321
13322 This will search for the separator in the string, starting at the end. If
13323 the separator is found, returns a 3-tuple containing the part before the
13324 separator, the separator itself, and the part after it.
13325
13326 If the separator is not found, returns a 3-tuple containing two empty strings
13327 and the original string.
13328 [clinic start generated code]*/
13329
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
{
    /* Method wrapper for str.rpartition(): everything, including the
       type check of `sep`, is done by PyUnicode_RPartition(). */
    return PyUnicode_RPartition(self, sep);
}
13336
13337 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13338 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13339 {
13340 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13341 return NULL;
13342
13343 return rsplit(s, sep, maxsplit);
13344 }
13345
13346 /*[clinic input]
13347 str.rsplit as unicode_rsplit = str.split
13348
13349 Return a list of the substrings in the string, using sep as the separator string.
13350
13351 Splitting starts at the end of the string and works to the front.
13352 [clinic start generated code]*/
13353
13354 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13355 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13356 /*[clinic end generated code: output=c2b815c63bcabffc input=ea78406060fce33c]*/
13357 {
13358 if (sep == Py_None)
13359 return rsplit(self, NULL, maxsplit);
13360 if (PyUnicode_Check(sep))
13361 return rsplit(self, sep, maxsplit);
13362
13363 PyErr_Format(PyExc_TypeError,
13364 "must be str or None, not %.100s",
13365 Py_TYPE(sep)->tp_name);
13366 return NULL;
13367 }
13368
13369 /*[clinic input]
13370 str.splitlines as unicode_splitlines
13371
13372 keepends: bool(accept={int}) = False
13373
13374 Return a list of the lines in the string, breaking at line boundaries.
13375
13376 Line breaks are not included in the resulting list unless keepends is given and
13377 true.
13378 [clinic start generated code]*/
13379
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
{
    /* Method wrapper for str.splitlines(): forwards `self` and the
       `keepends` flag to PyUnicode_Splitlines() unchanged. */
    return PyUnicode_Splitlines(self, keepends);
}
13386
/* Returns the result of unicode_result_unchanged(self); no conversion
   is performed here. */
static
PyObject *unicode_str(PyObject *self)
{
    return unicode_result_unchanged(self);
}
13392
13393 /*[clinic input]
13394 str.swapcase as unicode_swapcase
13395
13396 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13397 [clinic start generated code]*/
13398
13399 static PyObject *
unicode_swapcase_impl(PyObject * self)13400 unicode_swapcase_impl(PyObject *self)
13401 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13402 {
13403 if (PyUnicode_READY(self) == -1)
13404 return NULL;
13405 return case_operation(self, do_swapcase);
13406 }
13407
13408 /*[clinic input]
13409
13410 @staticmethod
13411 str.maketrans as unicode_maketrans
13412
13413 x: object
13414
13415 y: unicode=NULL
13416
13417 z: unicode=NULL
13418
13419 /
13420
13421 Return a translation table usable for str.translate().
13422
13423 If there is only one argument, it must be a dictionary mapping Unicode
13424 ordinals (integers) or characters to Unicode ordinals, strings or None.
13425 Character keys will be then converted to ordinals.
13426 If there are two arguments, they must be strings of equal length, and
13427 in the resulting dictionary, each character in x will be mapped to the
13428 character at the same position in y. If there is a third argument, it
13429 must be a string, whose characters will be mapped to None in the result.
13430 [clinic start generated code]*/
13431
13432 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13433 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13434 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13435 {
13436 PyObject *new = NULL, *key, *value;
13437 Py_ssize_t i = 0;
13438 int res;
13439
13440 new = PyDict_New();
13441 if (!new)
13442 return NULL;
13443 if (y != NULL) {
13444 int x_kind, y_kind, z_kind;
13445 const void *x_data, *y_data, *z_data;
13446
13447 /* x must be a string too, of equal length */
13448 if (!PyUnicode_Check(x)) {
13449 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13450 "be a string if there is a second argument");
13451 goto err;
13452 }
13453 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13454 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13455 "arguments must have equal length");
13456 goto err;
13457 }
13458 /* create entries for translating chars in x to those in y */
13459 x_kind = PyUnicode_KIND(x);
13460 y_kind = PyUnicode_KIND(y);
13461 x_data = PyUnicode_DATA(x);
13462 y_data = PyUnicode_DATA(y);
13463 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13464 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13465 if (!key)
13466 goto err;
13467 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13468 if (!value) {
13469 Py_DECREF(key);
13470 goto err;
13471 }
13472 res = PyDict_SetItem(new, key, value);
13473 Py_DECREF(key);
13474 Py_DECREF(value);
13475 if (res < 0)
13476 goto err;
13477 }
13478 /* create entries for deleting chars in z */
13479 if (z != NULL) {
13480 z_kind = PyUnicode_KIND(z);
13481 z_data = PyUnicode_DATA(z);
13482 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13483 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13484 if (!key)
13485 goto err;
13486 res = PyDict_SetItem(new, key, Py_None);
13487 Py_DECREF(key);
13488 if (res < 0)
13489 goto err;
13490 }
13491 }
13492 } else {
13493 int kind;
13494 const void *data;
13495
13496 /* x must be a dict */
13497 if (!PyDict_CheckExact(x)) {
13498 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13499 "to maketrans it must be a dict");
13500 goto err;
13501 }
13502 /* copy entries into the new dict, converting string keys to int keys */
13503 while (PyDict_Next(x, &i, &key, &value)) {
13504 if (PyUnicode_Check(key)) {
13505 /* convert string keys to integer keys */
13506 PyObject *newkey;
13507 if (PyUnicode_GET_LENGTH(key) != 1) {
13508 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13509 "table must be of length 1");
13510 goto err;
13511 }
13512 kind = PyUnicode_KIND(key);
13513 data = PyUnicode_DATA(key);
13514 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13515 if (!newkey)
13516 goto err;
13517 res = PyDict_SetItem(new, newkey, value);
13518 Py_DECREF(newkey);
13519 if (res < 0)
13520 goto err;
13521 } else if (PyLong_Check(key)) {
13522 /* just keep integer keys */
13523 if (PyDict_SetItem(new, key, value) < 0)
13524 goto err;
13525 } else {
13526 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13527 "be strings or integers");
13528 goto err;
13529 }
13530 }
13531 }
13532 return new;
13533 err:
13534 Py_DECREF(new);
13535 return NULL;
13536 }
13537
13538 /*[clinic input]
13539 str.translate as unicode_translate
13540
13541 table: object
13542 Translation table, which must be a mapping of Unicode ordinals to
13543 Unicode ordinals, strings, or None.
13544 /
13545
13546 Replace each character in the string using the given translation table.
13547
13548 The table must implement lookup/indexing via __getitem__, for instance a
13549 dictionary or list. If this operation raises LookupError, the character is
13550 left untouched. Characters mapped to None are deleted.
13551 [clinic start generated code]*/
13552
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
{
    /* Method wrapper for str.translate(): delegates to
       _PyUnicode_TranslateCharmap(), always passing "ignore" as the
       third argument. */
    return _PyUnicode_TranslateCharmap(self, table, "ignore");
}
13559
13560 /*[clinic input]
13561 str.upper as unicode_upper
13562
13563 Return a copy of the string converted to uppercase.
13564 [clinic start generated code]*/
13565
13566 static PyObject *
unicode_upper_impl(PyObject * self)13567 unicode_upper_impl(PyObject *self)
13568 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13569 {
13570 if (PyUnicode_READY(self) == -1)
13571 return NULL;
13572 if (PyUnicode_IS_ASCII(self))
13573 return ascii_upper_or_lower(self, 0);
13574 return case_operation(self, do_upper);
13575 }
13576
13577 /*[clinic input]
13578 str.zfill as unicode_zfill
13579
13580 width: Py_ssize_t
13581 /
13582
13583 Pad a numeric string with zeros on the left, to fill a field of the given width.
13584
13585 The string is never truncated.
13586 [clinic start generated code]*/
13587
13588 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13589 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13590 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13591 {
13592 Py_ssize_t fill;
13593 PyObject *u;
13594 int kind;
13595 const void *data;
13596 Py_UCS4 chr;
13597
13598 if (PyUnicode_READY(self) == -1)
13599 return NULL;
13600
13601 if (PyUnicode_GET_LENGTH(self) >= width)
13602 return unicode_result_unchanged(self);
13603
13604 fill = width - PyUnicode_GET_LENGTH(self);
13605
13606 u = pad(self, fill, 0, '0');
13607
13608 if (u == NULL)
13609 return NULL;
13610
13611 kind = PyUnicode_KIND(u);
13612 data = PyUnicode_DATA(u);
13613 chr = PyUnicode_READ(kind, data, fill);
13614
13615 if (chr == '+' || chr == '-') {
13616 /* move sign to beginning of string */
13617 PyUnicode_WRITE(kind, data, 0, chr);
13618 PyUnicode_WRITE(kind, data, fill, '0');
13619 }
13620
13621 assert(_PyUnicode_CheckConsistency(u, 1));
13622 return u;
13623 }
13624
/* Docstring text for the str.startswith() method. */
PyDoc_STRVAR(startswith__doc__,
"S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
13632
13633 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13634 unicode_startswith(PyObject *self,
13635 PyObject *args)
13636 {
13637 PyObject *subobj;
13638 PyObject *substring;
13639 Py_ssize_t start = 0;
13640 Py_ssize_t end = PY_SSIZE_T_MAX;
13641 int result;
13642
13643 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13644 return NULL;
13645 if (PyTuple_Check(subobj)) {
13646 Py_ssize_t i;
13647 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13648 substring = PyTuple_GET_ITEM(subobj, i);
13649 if (!PyUnicode_Check(substring)) {
13650 PyErr_Format(PyExc_TypeError,
13651 "tuple for startswith must only contain str, "
13652 "not %.100s",
13653 Py_TYPE(substring)->tp_name);
13654 return NULL;
13655 }
13656 result = tailmatch(self, substring, start, end, -1);
13657 if (result == -1)
13658 return NULL;
13659 if (result) {
13660 Py_RETURN_TRUE;
13661 }
13662 }
13663 /* nothing matched */
13664 Py_RETURN_FALSE;
13665 }
13666 if (!PyUnicode_Check(subobj)) {
13667 PyErr_Format(PyExc_TypeError,
13668 "startswith first arg must be str or "
13669 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13670 return NULL;
13671 }
13672 result = tailmatch(self, subobj, start, end, -1);
13673 if (result == -1)
13674 return NULL;
13675 return PyBool_FromLong(result);
13676 }
13677
13678
/* Docstring text for the str.endswith() method. */
PyDoc_STRVAR(endswith__doc__,
"S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
13686
13687 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13688 unicode_endswith(PyObject *self,
13689 PyObject *args)
13690 {
13691 PyObject *subobj;
13692 PyObject *substring;
13693 Py_ssize_t start = 0;
13694 Py_ssize_t end = PY_SSIZE_T_MAX;
13695 int result;
13696
13697 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13698 return NULL;
13699 if (PyTuple_Check(subobj)) {
13700 Py_ssize_t i;
13701 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13702 substring = PyTuple_GET_ITEM(subobj, i);
13703 if (!PyUnicode_Check(substring)) {
13704 PyErr_Format(PyExc_TypeError,
13705 "tuple for endswith must only contain str, "
13706 "not %.100s",
13707 Py_TYPE(substring)->tp_name);
13708 return NULL;
13709 }
13710 result = tailmatch(self, substring, start, end, +1);
13711 if (result == -1)
13712 return NULL;
13713 if (result) {
13714 Py_RETURN_TRUE;
13715 }
13716 }
13717 Py_RETURN_FALSE;
13718 }
13719 if (!PyUnicode_Check(subobj)) {
13720 PyErr_Format(PyExc_TypeError,
13721 "endswith first arg must be str or "
13722 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13723 return NULL;
13724 }
13725 result = tailmatch(self, subobj, start, end, +1);
13726 if (result == -1)
13727 return NULL;
13728 return PyBool_FromLong(result);
13729 }
13730
13731 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13732 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13733 {
13734 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13735 writer->data = PyUnicode_DATA(writer->buffer);
13736
13737 if (!writer->readonly) {
13738 writer->kind = PyUnicode_KIND(writer->buffer);
13739 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13740 }
13741 else {
13742 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13743 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13744 writer->kind = PyUnicode_WCHAR_KIND;
13745 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13746
13747 /* Copy-on-write mode: set buffer size to 0 so
13748 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13749 * next write. */
13750 writer->size = 0;
13751 }
13752 }
13753
13754 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13755 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13756 {
13757 memset(writer, 0, sizeof(*writer));
13758
13759 /* ASCII is the bare minimum */
13760 writer->min_char = 127;
13761
13762 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13763 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13764 writer->kind = PyUnicode_WCHAR_KIND;
13765 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13766 }
13767
13768 // Initialize _PyUnicodeWriter with initial buffer
13769 static inline void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter * writer,PyObject * buffer)13770 _PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13771 {
13772 memset(writer, 0, sizeof(*writer));
13773 writer->buffer = buffer;
13774 _PyUnicodeWriter_Update(writer);
13775 writer->min_length = writer->size;
13776 }
13777
/* Make sure the writer can take `length` more characters at
   writer->pos, each no larger than `maxchar`.

   Three cases are handled:
     - no buffer yet: a new string is allocated;
     - buffer too short: it is replaced by a new (possibly wider) string
       when maxchar grows or the buffer is read-only, otherwise it goes
       through resize_compact();
     - buffer long enough but too narrow: a new string of the same size
       with the larger maxchar replaces it.
   When a new string replaces the old one, the first writer->pos
   characters are copied over.

   Returns 0 on success, -1 on failure (PyErr_NoMemory() is raised here
   for the length overflow; other failures come from the callees). */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* guard writer->pos + length against Py_ssize_t overflow */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* never allocate narrower than the writer's configured minimum */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* first allocation */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* existing buffer is too short */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            /* the old buffer is released; the writer now owns a
               private, writable string */
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* same width: hand the buffer to resize_compact() */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* long enough, but too narrow: widen at the same size */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* re-derive data/maxchar/kind/size from the (possibly new) buffer */
    _PyUnicodeWriter_Update(writer);
    return 0;

    /* the macro is not available to code after this function */
#undef OVERALLOCATE_FACTOR
}
13854
13855 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13856 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13857 enum PyUnicode_Kind kind)
13858 {
13859 Py_UCS4 maxchar;
13860
13861 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13862 assert(writer->kind < kind);
13863
13864 switch (kind)
13865 {
13866 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13867 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13868 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13869 default:
13870 Py_UNREACHABLE();
13871 }
13872
13873 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13874 }
13875
13876 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13877 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13878 {
13879 assert(ch <= MAX_UNICODE);
13880 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13881 return -1;
13882 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13883 writer->pos++;
13884 return 0;
13885 }
13886
/* Non-inline entry point: forwards to
   _PyUnicodeWriter_WriteCharInline() and returns its result. */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}
13892
13893 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13894 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13895 {
13896 Py_UCS4 maxchar;
13897 Py_ssize_t len;
13898
13899 if (PyUnicode_READY(str) == -1)
13900 return -1;
13901 len = PyUnicode_GET_LENGTH(str);
13902 if (len == 0)
13903 return 0;
13904 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13905 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13906 if (writer->buffer == NULL && !writer->overallocate) {
13907 assert(_PyUnicode_CheckConsistency(str, 1));
13908 writer->readonly = 1;
13909 Py_INCREF(str);
13910 writer->buffer = str;
13911 _PyUnicodeWriter_Update(writer);
13912 writer->pos += len;
13913 return 0;
13914 }
13915 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13916 return -1;
13917 }
13918 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13919 str, 0, len);
13920 writer->pos += len;
13921 return 0;
13922 }
13923
13924 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13925 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13926 Py_ssize_t start, Py_ssize_t end)
13927 {
13928 Py_UCS4 maxchar;
13929 Py_ssize_t len;
13930
13931 if (PyUnicode_READY(str) == -1)
13932 return -1;
13933
13934 assert(0 <= start);
13935 assert(end <= PyUnicode_GET_LENGTH(str));
13936 assert(start <= end);
13937
13938 if (end == 0)
13939 return 0;
13940
13941 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13942 return _PyUnicodeWriter_WriteStr(writer, str);
13943
13944 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13945 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13946 else
13947 maxchar = writer->maxchar;
13948 len = end - start;
13949
13950 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13951 return -1;
13952
13953 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13954 str, start, len);
13955 writer->pos += len;
13956 return 0;
13957 }
13958
13959 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13960 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13961 const char *ascii, Py_ssize_t len)
13962 {
13963 if (len == -1)
13964 len = strlen(ascii);
13965
13966 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13967
13968 if (writer->buffer == NULL && !writer->overallocate) {
13969 PyObject *str;
13970
13971 str = _PyUnicode_FromASCII(ascii, len);
13972 if (str == NULL)
13973 return -1;
13974
13975 writer->readonly = 1;
13976 writer->buffer = str;
13977 _PyUnicodeWriter_Update(writer);
13978 writer->pos += len;
13979 return 0;
13980 }
13981
13982 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13983 return -1;
13984
13985 switch (writer->kind)
13986 {
13987 case PyUnicode_1BYTE_KIND:
13988 {
13989 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13990 Py_UCS1 *data = writer->data;
13991
13992 memcpy(data + writer->pos, str, len);
13993 break;
13994 }
13995 case PyUnicode_2BYTE_KIND:
13996 {
13997 _PyUnicode_CONVERT_BYTES(
13998 Py_UCS1, Py_UCS2,
13999 ascii, ascii + len,
14000 (Py_UCS2 *)writer->data + writer->pos);
14001 break;
14002 }
14003 case PyUnicode_4BYTE_KIND:
14004 {
14005 _PyUnicode_CONVERT_BYTES(
14006 Py_UCS1, Py_UCS4,
14007 ascii, ascii + len,
14008 (Py_UCS4 *)writer->data + writer->pos);
14009 break;
14010 }
14011 default:
14012 Py_UNREACHABLE();
14013 }
14014
14015 writer->pos += len;
14016 return 0;
14017 }
14018
14019 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)14020 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14021 const char *str, Py_ssize_t len)
14022 {
14023 Py_UCS4 maxchar;
14024
14025 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14026 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14027 return -1;
14028 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14029 writer->pos += len;
14030 return 0;
14031 }
14032
14033 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)14034 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14035 {
14036 PyObject *str;
14037
14038 if (writer->pos == 0) {
14039 Py_CLEAR(writer->buffer);
14040 _Py_RETURN_UNICODE_EMPTY();
14041 }
14042
14043 str = writer->buffer;
14044 writer->buffer = NULL;
14045
14046 if (writer->readonly) {
14047 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14048 return str;
14049 }
14050
14051 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14052 PyObject *str2;
14053 str2 = resize_compact(str, writer->pos);
14054 if (str2 == NULL) {
14055 Py_DECREF(str);
14056 return NULL;
14057 }
14058 str = str2;
14059 }
14060
14061 assert(_PyUnicode_CheckConsistency(str, 1));
14062 return unicode_result_ready(str);
14063 }
14064
/* Release the writer's buffer reference, if any, and set the field to
   NULL.  No other writer field is touched. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
14070
14071 #include "stringlib/unicode_format.h"
14072
/* Docstring text for the str.format() method. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring text for the str.format_map() method. */
PyDoc_STRVAR(format_map__doc__,
"S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
14084
14085 /*[clinic input]
14086 str.__format__ as unicode___format__
14087
14088 format_spec: unicode
14089 /
14090
14091 Return a formatted version of the string as described by format_spec.
14092 [clinic start generated code]*/
14093
14094 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)14095 unicode___format___impl(PyObject *self, PyObject *format_spec)
14096 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14097 {
14098 _PyUnicodeWriter writer;
14099 int ret;
14100
14101 if (PyUnicode_READY(self) == -1)
14102 return NULL;
14103 _PyUnicodeWriter_Init(&writer);
14104 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14105 self, format_spec, 0,
14106 PyUnicode_GET_LENGTH(format_spec));
14107 if (ret == -1) {
14108 _PyUnicodeWriter_Dealloc(&writer);
14109 return NULL;
14110 }
14111 return _PyUnicodeWriter_Finish(&writer);
14112 }
14113
14114 /*[clinic input]
14115 str.__sizeof__ as unicode_sizeof
14116
14117 Return the size of the string in memory, in bytes.
14118 [clinic start generated code]*/
14119
14120 static PyObject *
unicode_sizeof_impl(PyObject * self)14121 unicode_sizeof_impl(PyObject *self)
14122 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14123 {
14124 Py_ssize_t size;
14125
14126 /* If it's a compact object, account for base structure +
14127 character data. */
14128 if (PyUnicode_IS_COMPACT_ASCII(self))
14129 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14130 else if (PyUnicode_IS_COMPACT(self))
14131 size = sizeof(PyCompactUnicodeObject) +
14132 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14133 else {
14134 /* If it is a two-block object, account for base object, and
14135 for character block if present. */
14136 size = sizeof(PyUnicodeObject);
14137 if (_PyUnicode_DATA_ANY(self))
14138 size += (PyUnicode_GET_LENGTH(self) + 1) *
14139 PyUnicode_KIND(self);
14140 }
14141 /* If the wstr pointer is present, account for it unless it is shared
14142 with the data pointer. Check if the data is not shared. */
14143 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14144 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14145 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14146 size += PyUnicode_UTF8_LENGTH(self) + 1;
14147
14148 return PyLong_FromSsize_t(size);
14149 }
14150
14151 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))14152 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14153 {
14154 PyObject *copy = _PyUnicode_Copy(v);
14155 if (!copy)
14156 return NULL;
14157 return Py_BuildValue("(N)", copy);
14158 }
14159
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined elsewhere
   (presumably generated by Argument Clinic -- confirm against the clinic
   header); the remaining entries are written out by hand.  The table is
   terminated by a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() accepts keyword arguments, hence METH_KEYWORDS. */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
    /* Registered without a docstring. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
14213
14214 static PyObject *
unicode_mod(PyObject * v,PyObject * w)14215 unicode_mod(PyObject *v, PyObject *w)
14216 {
14217 if (!PyUnicode_Check(v))
14218 Py_RETURN_NOTIMPLEMENTED;
14219 return PyUnicode_Format(v, w);
14220 }
14221
14222 static PyNumberMethods unicode_as_number = {
14223 0, /*nb_add*/
14224 0, /*nb_subtract*/
14225 0, /*nb_multiply*/
14226 unicode_mod, /*nb_remainder*/
14227 };
14228
14229 static PySequenceMethods unicode_as_sequence = {
14230 (lenfunc) unicode_length, /* sq_length */
14231 PyUnicode_Concat, /* sq_concat */
14232 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14233 (ssizeargfunc) unicode_getitem, /* sq_item */
14234 0, /* sq_slice */
14235 0, /* sq_ass_item */
14236 0, /* sq_ass_slice */
14237 PyUnicode_Contains, /* sq_contains */
14238 };
14239
14240 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)14241 unicode_subscript(PyObject* self, PyObject* item)
14242 {
14243 if (PyUnicode_READY(self) == -1)
14244 return NULL;
14245
14246 if (_PyIndex_Check(item)) {
14247 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14248 if (i == -1 && PyErr_Occurred())
14249 return NULL;
14250 if (i < 0)
14251 i += PyUnicode_GET_LENGTH(self);
14252 return unicode_getitem(self, i);
14253 } else if (PySlice_Check(item)) {
14254 Py_ssize_t start, stop, step, slicelength, i;
14255 size_t cur;
14256 PyObject *result;
14257 const void *src_data;
14258 void *dest_data;
14259 int src_kind, dest_kind;
14260 Py_UCS4 ch, max_char, kind_limit;
14261
14262 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14263 return NULL;
14264 }
14265 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14266 &start, &stop, step);
14267
14268 if (slicelength <= 0) {
14269 _Py_RETURN_UNICODE_EMPTY();
14270 } else if (start == 0 && step == 1 &&
14271 slicelength == PyUnicode_GET_LENGTH(self)) {
14272 return unicode_result_unchanged(self);
14273 } else if (step == 1) {
14274 return PyUnicode_Substring(self,
14275 start, start + slicelength);
14276 }
14277 /* General case */
14278 src_kind = PyUnicode_KIND(self);
14279 src_data = PyUnicode_DATA(self);
14280 if (!PyUnicode_IS_ASCII(self)) {
14281 kind_limit = kind_maxchar_limit(src_kind);
14282 max_char = 0;
14283 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14284 ch = PyUnicode_READ(src_kind, src_data, cur);
14285 if (ch > max_char) {
14286 max_char = ch;
14287 if (max_char >= kind_limit)
14288 break;
14289 }
14290 }
14291 }
14292 else
14293 max_char = 127;
14294 result = PyUnicode_New(slicelength, max_char);
14295 if (result == NULL)
14296 return NULL;
14297 dest_kind = PyUnicode_KIND(result);
14298 dest_data = PyUnicode_DATA(result);
14299
14300 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14301 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14302 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14303 }
14304 assert(_PyUnicode_CheckConsistency(result, 1));
14305 return result;
14306 } else {
14307 PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
14308 Py_TYPE(item)->tp_name);
14309 return NULL;
14310 }
14311 }
14312
14313 static PyMappingMethods unicode_as_mapping = {
14314 (lenfunc)unicode_length, /* mp_length */
14315 (binaryfunc)unicode_subscript, /* mp_subscript */
14316 (objobjargproc)0, /* mp_ass_subscript */
14317 };
14318
14319
14320 /* Helpers for PyUnicode_Format() */
14321
/* State shared by the helpers of PyUnicode_Format() while one format
   string is being processed. */
struct unicode_formatter_t {
    /* Object the next argument is taken from: the caller's args, or the
       value looked up in `dict` for the last "%(key)" specifier. */
    PyObject *args;
    /* Non-zero when `args` is a reference obtained from a "%(key)" lookup
       that must be released before `args` is replaced. */
    int args_owned;
    /* arglen: size of the args tuple, or -1 when `args` is used as a single
       value.  argidx: index of the next argument to hand out (starts at 0
       for a tuple, at -2 for a single value). */
    Py_ssize_t arglen, argidx;
    /* The mapping used for "%(key)" lookups, or NULL when args is not
       treated as a mapping. */
    PyObject *dict;

    /* Kind, data pointer and owning object of the format string. */
    enum PyUnicode_Kind fmtkind;
    /* fmtcnt: countdown of format characters left, decremented as they are
       consumed.  fmtpos: index of the next format character to read. */
    Py_ssize_t fmtcnt, fmtpos;
    const void *fmtdata;
    PyObject *fmtstr;

    /* Writer that accumulates the formatted output. */
    _PyUnicodeWriter writer;
};
14335
/* Parsed description of a single "%..." format specifier. */
struct unicode_format_arg_t {
    /* Most recently read format character; after parsing, the conversion
       character (for example 's' or 'd'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Minimum field width, or -1 when none was given. */
    Py_ssize_t width;
    /* Precision, or -1 when none was given. */
    int prec;
    /* Set to 1 by the numeric conversions so that the output step applies
       sign handling; 0 otherwise. */
    int sign;
};
14343
14344 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14345 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14346 {
14347 Py_ssize_t argidx = ctx->argidx;
14348
14349 if (argidx < ctx->arglen) {
14350 ctx->argidx++;
14351 if (ctx->arglen < 0)
14352 return ctx->args;
14353 else
14354 return PyTuple_GetItem(ctx->args, argidx);
14355 }
14356 PyErr_SetString(PyExc_TypeError,
14357 "not enough arguments for format string");
14358 return NULL;
14359 }
14360
14361 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14362
14363 /* Format a float into the writer if the writer is not NULL, or into *p_output
14364 otherwise.
14365
14366 Return 0 on success, raise an exception and return -1 on error. */
14367 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14368 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14369 PyObject **p_output,
14370 _PyUnicodeWriter *writer)
14371 {
14372 char *p;
14373 double x;
14374 Py_ssize_t len;
14375 int prec;
14376 int dtoa_flags = 0;
14377
14378 x = PyFloat_AsDouble(v);
14379 if (x == -1.0 && PyErr_Occurred())
14380 return -1;
14381
14382 prec = arg->prec;
14383 if (prec < 0)
14384 prec = 6;
14385
14386 if (arg->flags & F_ALT)
14387 dtoa_flags |= Py_DTSF_ALT;
14388 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14389 if (p == NULL)
14390 return -1;
14391 len = strlen(p);
14392 if (writer) {
14393 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14394 PyMem_Free(p);
14395 return -1;
14396 }
14397 }
14398 else
14399 *p_output = _PyUnicode_FromASCII(p, len);
14400 PyMem_Free(p);
14401 return 0;
14402 }
14403
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0x" | "0X" | "0o")? digit+
 *     The base marker is present only for o, x and X conversions, and only
 *     when alt is non-zero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *     necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * The string produced by PyNumber_ToBase() is edited in place where
 * possible; a new string is built when digits have to be inserted or the
 * base marker is stripped.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX: up to 3 non-digit characters (sign and
       base marker) are added to prec below. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* Accounts for the two-character base marker. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* Accounts for the two-character base marker. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; the sign, if any, is rewritten directly in
           front of the digits.  buf no longer points at the start of the
           string data after this. */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width. */
    if (prec > numdigits) {
        /* Build the padded text in a temporary bytes object: non-digits,
           then the zero fill, then the digits. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        /* From here on, result is the bytes object, not a str. */
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* A fresh str is needed when result is the temporary bytes object or
       when buf was advanced past the start of the string data. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14549
14550 /* Format an integer or a float as an integer.
14551 * Return 1 if the number has been formatted into the writer,
14552 * 0 if the number has been formatted into *p_output
14553 * -1 and raise an exception on error */
14554 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14555 mainformatlong(PyObject *v,
14556 struct unicode_format_arg_t *arg,
14557 PyObject **p_output,
14558 _PyUnicodeWriter *writer)
14559 {
14560 PyObject *iobj, *res;
14561 char type = (char)arg->ch;
14562
14563 if (!PyNumber_Check(v))
14564 goto wrongtype;
14565
14566 /* make sure number is a type of integer for o, x, and X */
14567 if (!PyLong_Check(v)) {
14568 if (type == 'o' || type == 'x' || type == 'X') {
14569 iobj = _PyNumber_Index(v);
14570 }
14571 else {
14572 iobj = PyNumber_Long(v);
14573 }
14574 if (iobj == NULL ) {
14575 if (PyErr_ExceptionMatches(PyExc_TypeError))
14576 goto wrongtype;
14577 return -1;
14578 }
14579 assert(PyLong_Check(iobj));
14580 }
14581 else {
14582 iobj = v;
14583 Py_INCREF(iobj);
14584 }
14585
14586 if (PyLong_CheckExact(v)
14587 && arg->width == -1 && arg->prec == -1
14588 && !(arg->flags & (F_SIGN | F_BLANK))
14589 && type != 'X')
14590 {
14591 /* Fast path */
14592 int alternate = arg->flags & F_ALT;
14593 int base;
14594
14595 switch(type)
14596 {
14597 default:
14598 Py_UNREACHABLE();
14599 case 'd':
14600 case 'i':
14601 case 'u':
14602 base = 10;
14603 break;
14604 case 'o':
14605 base = 8;
14606 break;
14607 case 'x':
14608 case 'X':
14609 base = 16;
14610 break;
14611 }
14612
14613 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14614 Py_DECREF(iobj);
14615 return -1;
14616 }
14617 Py_DECREF(iobj);
14618 return 1;
14619 }
14620
14621 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14622 Py_DECREF(iobj);
14623 if (res == NULL)
14624 return -1;
14625 *p_output = res;
14626 return 0;
14627
14628 wrongtype:
14629 switch(type)
14630 {
14631 case 'o':
14632 case 'x':
14633 case 'X':
14634 PyErr_Format(PyExc_TypeError,
14635 "%%%c format: an integer is required, "
14636 "not %.200s",
14637 type, Py_TYPE(v)->tp_name);
14638 break;
14639 default:
14640 PyErr_Format(PyExc_TypeError,
14641 "%%%c format: a real number is required, "
14642 "not %.200s",
14643 type, Py_TYPE(v)->tp_name);
14644 break;
14645 }
14646 return -1;
14647 }
14648
14649 static Py_UCS4
formatchar(PyObject * v)14650 formatchar(PyObject *v)
14651 {
14652 /* presume that the buffer is at least 3 characters long */
14653 if (PyUnicode_Check(v)) {
14654 if (PyUnicode_GET_LENGTH(v) == 1) {
14655 return PyUnicode_READ_CHAR(v, 0);
14656 }
14657 goto onError;
14658 }
14659 else {
14660 int overflow;
14661 long x = PyLong_AsLongAndOverflow(v, &overflow);
14662 if (x == -1 && PyErr_Occurred()) {
14663 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14664 goto onError;
14665 }
14666 return (Py_UCS4) -1;
14667 }
14668
14669 if (x < 0 || x > MAX_UNICODE) {
14670 /* this includes an overflow in converting to C long */
14671 PyErr_SetString(PyExc_OverflowError,
14672 "%c arg not in range(0x110000)");
14673 return (Py_UCS4) -1;
14674 }
14675
14676 return (Py_UCS4) x;
14677 }
14678
14679 onError:
14680 PyErr_SetString(PyExc_TypeError,
14681 "%c requires int or char");
14682 return (Py_UCS4) -1;
14683 }
14684
14685 /* Parse options of an argument: flags, width, precision.
14686 Handle also "%(name)" syntax.
14687
14688 Return 0 if the argument has been formatted into arg->str.
14689 Return 1 if the argument has been written into ctx->writer,
14690 Raise an exception and return -1 on error. */
14691 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14692 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14693 struct unicode_format_arg_t *arg)
14694 {
14695 #define FORMAT_READ(ctx) \
14696 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14697
14698 PyObject *v;
14699
14700 if (arg->ch == '(') {
14701 /* Get argument value from a dictionary. Example: "%(name)s". */
14702 Py_ssize_t keystart;
14703 Py_ssize_t keylen;
14704 PyObject *key;
14705 int pcount = 1;
14706
14707 if (ctx->dict == NULL) {
14708 PyErr_SetString(PyExc_TypeError,
14709 "format requires a mapping");
14710 return -1;
14711 }
14712 ++ctx->fmtpos;
14713 --ctx->fmtcnt;
14714 keystart = ctx->fmtpos;
14715 /* Skip over balanced parentheses */
14716 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14717 arg->ch = FORMAT_READ(ctx);
14718 if (arg->ch == ')')
14719 --pcount;
14720 else if (arg->ch == '(')
14721 ++pcount;
14722 ctx->fmtpos++;
14723 }
14724 keylen = ctx->fmtpos - keystart - 1;
14725 if (ctx->fmtcnt < 0 || pcount > 0) {
14726 PyErr_SetString(PyExc_ValueError,
14727 "incomplete format key");
14728 return -1;
14729 }
14730 key = PyUnicode_Substring(ctx->fmtstr,
14731 keystart, keystart + keylen);
14732 if (key == NULL)
14733 return -1;
14734 if (ctx->args_owned) {
14735 ctx->args_owned = 0;
14736 Py_DECREF(ctx->args);
14737 }
14738 ctx->args = PyObject_GetItem(ctx->dict, key);
14739 Py_DECREF(key);
14740 if (ctx->args == NULL)
14741 return -1;
14742 ctx->args_owned = 1;
14743 ctx->arglen = -1;
14744 ctx->argidx = -2;
14745 }
14746
14747 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14748 while (--ctx->fmtcnt >= 0) {
14749 arg->ch = FORMAT_READ(ctx);
14750 ctx->fmtpos++;
14751 switch (arg->ch) {
14752 case '-': arg->flags |= F_LJUST; continue;
14753 case '+': arg->flags |= F_SIGN; continue;
14754 case ' ': arg->flags |= F_BLANK; continue;
14755 case '#': arg->flags |= F_ALT; continue;
14756 case '0': arg->flags |= F_ZERO; continue;
14757 }
14758 break;
14759 }
14760
14761 /* Parse width. Example: "%10s" => width=10 */
14762 if (arg->ch == '*') {
14763 v = unicode_format_getnextarg(ctx);
14764 if (v == NULL)
14765 return -1;
14766 if (!PyLong_Check(v)) {
14767 PyErr_SetString(PyExc_TypeError,
14768 "* wants int");
14769 return -1;
14770 }
14771 arg->width = PyLong_AsSsize_t(v);
14772 if (arg->width == -1 && PyErr_Occurred())
14773 return -1;
14774 if (arg->width < 0) {
14775 arg->flags |= F_LJUST;
14776 arg->width = -arg->width;
14777 }
14778 if (--ctx->fmtcnt >= 0) {
14779 arg->ch = FORMAT_READ(ctx);
14780 ctx->fmtpos++;
14781 }
14782 }
14783 else if (arg->ch >= '0' && arg->ch <= '9') {
14784 arg->width = arg->ch - '0';
14785 while (--ctx->fmtcnt >= 0) {
14786 arg->ch = FORMAT_READ(ctx);
14787 ctx->fmtpos++;
14788 if (arg->ch < '0' || arg->ch > '9')
14789 break;
14790 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14791 mixing signed and unsigned comparison. Since arg->ch is between
14792 '0' and '9', casting to int is safe. */
14793 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14794 PyErr_SetString(PyExc_ValueError,
14795 "width too big");
14796 return -1;
14797 }
14798 arg->width = arg->width*10 + (arg->ch - '0');
14799 }
14800 }
14801
14802 /* Parse precision. Example: "%.3f" => prec=3 */
14803 if (arg->ch == '.') {
14804 arg->prec = 0;
14805 if (--ctx->fmtcnt >= 0) {
14806 arg->ch = FORMAT_READ(ctx);
14807 ctx->fmtpos++;
14808 }
14809 if (arg->ch == '*') {
14810 v = unicode_format_getnextarg(ctx);
14811 if (v == NULL)
14812 return -1;
14813 if (!PyLong_Check(v)) {
14814 PyErr_SetString(PyExc_TypeError,
14815 "* wants int");
14816 return -1;
14817 }
14818 arg->prec = _PyLong_AsInt(v);
14819 if (arg->prec == -1 && PyErr_Occurred())
14820 return -1;
14821 if (arg->prec < 0)
14822 arg->prec = 0;
14823 if (--ctx->fmtcnt >= 0) {
14824 arg->ch = FORMAT_READ(ctx);
14825 ctx->fmtpos++;
14826 }
14827 }
14828 else if (arg->ch >= '0' && arg->ch <= '9') {
14829 arg->prec = arg->ch - '0';
14830 while (--ctx->fmtcnt >= 0) {
14831 arg->ch = FORMAT_READ(ctx);
14832 ctx->fmtpos++;
14833 if (arg->ch < '0' || arg->ch > '9')
14834 break;
14835 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14836 PyErr_SetString(PyExc_ValueError,
14837 "precision too big");
14838 return -1;
14839 }
14840 arg->prec = arg->prec*10 + (arg->ch - '0');
14841 }
14842 }
14843 }
14844
14845 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14846 if (ctx->fmtcnt >= 0) {
14847 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14848 if (--ctx->fmtcnt >= 0) {
14849 arg->ch = FORMAT_READ(ctx);
14850 ctx->fmtpos++;
14851 }
14852 }
14853 }
14854 if (ctx->fmtcnt < 0) {
14855 PyErr_SetString(PyExc_ValueError,
14856 "incomplete format");
14857 return -1;
14858 }
14859 return 0;
14860
14861 #undef FORMAT_READ
14862 }
14863
14864 /* Format one argument. Supported conversion specifiers:
14865
14866 - "s", "r", "a": any type
14867 - "i", "d", "u": int or float
14868 - "o", "x", "X": int
14869 - "e", "E", "f", "F", "g", "G": float
14870 - "c": int or str (1 character)
14871
14872 When possible, the output is written directly into the Unicode writer
14873 (ctx->writer). A string is created when padding is required.
14874
14875 Return 0 if the argument has been formatted into *p_str,
14876 1 if the argument has been written into ctx->writer,
14877 -1 on error. */
14878 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14879 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14880 struct unicode_format_arg_t *arg,
14881 PyObject **p_str)
14882 {
14883 PyObject *v;
14884 _PyUnicodeWriter *writer = &ctx->writer;
14885
14886 if (ctx->fmtcnt == 0)
14887 ctx->writer.overallocate = 0;
14888
14889 v = unicode_format_getnextarg(ctx);
14890 if (v == NULL)
14891 return -1;
14892
14893
14894 switch (arg->ch) {
14895 case 's':
14896 case 'r':
14897 case 'a':
14898 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14899 /* Fast path */
14900 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14901 return -1;
14902 return 1;
14903 }
14904
14905 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14906 *p_str = v;
14907 Py_INCREF(*p_str);
14908 }
14909 else {
14910 if (arg->ch == 's')
14911 *p_str = PyObject_Str(v);
14912 else if (arg->ch == 'r')
14913 *p_str = PyObject_Repr(v);
14914 else
14915 *p_str = PyObject_ASCII(v);
14916 }
14917 break;
14918
14919 case 'i':
14920 case 'd':
14921 case 'u':
14922 case 'o':
14923 case 'x':
14924 case 'X':
14925 {
14926 int ret = mainformatlong(v, arg, p_str, writer);
14927 if (ret != 0)
14928 return ret;
14929 arg->sign = 1;
14930 break;
14931 }
14932
14933 case 'e':
14934 case 'E':
14935 case 'f':
14936 case 'F':
14937 case 'g':
14938 case 'G':
14939 if (arg->width == -1 && arg->prec == -1
14940 && !(arg->flags & (F_SIGN | F_BLANK)))
14941 {
14942 /* Fast path */
14943 if (formatfloat(v, arg, NULL, writer) == -1)
14944 return -1;
14945 return 1;
14946 }
14947
14948 arg->sign = 1;
14949 if (formatfloat(v, arg, p_str, NULL) == -1)
14950 return -1;
14951 break;
14952
14953 case 'c':
14954 {
14955 Py_UCS4 ch = formatchar(v);
14956 if (ch == (Py_UCS4) -1)
14957 return -1;
14958 if (arg->width == -1 && arg->prec == -1) {
14959 /* Fast path */
14960 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14961 return -1;
14962 return 1;
14963 }
14964 *p_str = PyUnicode_FromOrdinal(ch);
14965 break;
14966 }
14967
14968 default:
14969 PyErr_Format(PyExc_ValueError,
14970 "unsupported format character '%c' (0x%x) "
14971 "at index %zd",
14972 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14973 (int)arg->ch,
14974 ctx->fmtpos - 1);
14975 return -1;
14976 }
14977 if (*p_str == NULL)
14978 return -1;
14979 assert (PyUnicode_Check(*p_str));
14980 return 0;
14981 }
14982
/* Write the already converted string `str` into ctx->writer, applying the
   width, precision, sign and alternate-form options stored in `arg`.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 with an exception set on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    const void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding applies only to numeric conversions (arg->sign set). */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    /* No padding, no truncation and no sign flag: copy str unchanged. */
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* Take the sign out of the string; it is written separately. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character will be written. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character counts towards maxchar only if left padding
       will actually be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra slot for the sign when it does not fit in the width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With zero fill the sign precedes the padding; with space fill it
           is written after the padding (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           remaining characters in both cases. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left to pad on the right. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    if (arg->width > len) {
        /* Right padding always uses spaces, whatever the fill character. */
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
15137
15138 /* Helper of PyUnicode_Format(): format one arg.
15139 Return 0 on success, raise an exception and return -1 on error. */
15140 static int
unicode_format_arg(struct unicode_formatter_t * ctx)15141 unicode_format_arg(struct unicode_formatter_t *ctx)
15142 {
15143 struct unicode_format_arg_t arg;
15144 PyObject *str;
15145 int ret;
15146
15147 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15148 if (arg.ch == '%') {
15149 ctx->fmtpos++;
15150 ctx->fmtcnt--;
15151 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15152 return -1;
15153 return 0;
15154 }
15155 arg.flags = 0;
15156 arg.width = -1;
15157 arg.prec = -1;
15158 arg.sign = 0;
15159 str = NULL;
15160
15161 ret = unicode_format_arg_parse(ctx, &arg);
15162 if (ret == -1)
15163 return -1;
15164
15165 ret = unicode_format_arg_format(ctx, &arg, &str);
15166 if (ret == -1)
15167 return -1;
15168
15169 if (ret != 1) {
15170 ret = unicode_format_arg_output(ctx, &arg, str);
15171 Py_DECREF(str);
15172 if (ret == -1)
15173 return -1;
15174 }
15175
15176 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15177 PyErr_SetString(PyExc_TypeError,
15178 "not all arguments converted during string formatting");
15179 return -1;
15180 }
15181 return 0;
15182 }
15183
/* Format `args` according to the printf-style format string `format` and
   return the result as a new string.

   `format` must pass ensure_unicode().  `args` is treated as a tuple of
   values when PyTuple_Check() succeeds and as a single value otherwise; when
   it is a mapping that is neither a tuple nor a str it is additionally
   stored in ctx.dict.

   Return NULL with an exception set on error.  A TypeError is raised when,
   after the whole format string has been processed, ctx.argidx is still
   below ctx.arglen and no mapping is in use. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (ensure_unicode(format) < 0)
        return NULL;

    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    /* fmtcnt is the number of format characters not yet consumed; fmtpos is
       the index of the next character to read. */
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    if (PyTuple_Check(args)) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* Sentinel values for a non-tuple argument.  As long as they are
           left unchanged, the "argidx < arglen" test after the loop is true
           (the code that updates them is outside this function). */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args_owned = 0;
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
        ctx.dict = args;
    else
        ctx.dict = NULL;
    ctx.args = args;

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            /* Literal text: find the end of the run of non-'%' characters
               and copy it to the output in one call. */
            Py_ssize_t nonfmtpos;

            nonfmtpos = ctx.fmtpos++;
            while (ctx.fmtcnt >= 0 &&
                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
                ctx.fmtpos++;
                ctx.fmtcnt--;
            }
            if (ctx.fmtcnt < 0) {
                /* The scan ran off the end of the format string and advanced
                   one position too far; step back.  Nothing follows this
                   write, so overallocation is switched off. */
                ctx.fmtpos--;
                ctx.writer.overallocate = 0;
            }

            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                                nonfmtpos, ctx.fmtpos) < 0)
                goto onError;
        }
        else {
            /* Skip the '%' and let the helper handle the specifier. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1)
                goto onError;
        }
    }

    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* ctx.args_owned starts at 0 here; when it has been set, ctx.args holds
       a reference that this function must release. */
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15266
15267 static PyObject *
15268 unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15269
15270 /*[clinic input]
15271 @classmethod
15272 str.__new__ as unicode_new
15273
15274 object as x: object = NULL
15275 encoding: str = NULL
15276 errors: str = NULL
15277
15278 [clinic start generated code]*/
15279
15280 static PyObject *
unicode_new_impl(PyTypeObject * type,PyObject * x,const char * encoding,const char * errors)15281 unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15282 const char *errors)
15283 /*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15284 {
15285 PyObject *unicode;
15286 if (x == NULL) {
15287 unicode = unicode_new_empty();
15288 }
15289 else if (encoding == NULL && errors == NULL) {
15290 unicode = PyObject_Str(x);
15291 }
15292 else {
15293 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15294 }
15295
15296 if (unicode != NULL && type != &PyUnicode_Type) {
15297 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15298 }
15299 return unicode;
15300 }
15301
15302 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * unicode)15303 unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15304 {
15305 PyObject *self;
15306 Py_ssize_t length, char_size;
15307 int share_wstr, share_utf8;
15308 unsigned int kind;
15309 void *data;
15310
15311 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15312 assert(_PyUnicode_CHECK(unicode));
15313 if (PyUnicode_READY(unicode) == -1) {
15314 return NULL;
15315 }
15316
15317 self = type->tp_alloc(type, 0);
15318 if (self == NULL) {
15319 return NULL;
15320 }
15321 kind = PyUnicode_KIND(unicode);
15322 length = PyUnicode_GET_LENGTH(unicode);
15323
15324 _PyUnicode_LENGTH(self) = length;
15325 #ifdef Py_DEBUG
15326 _PyUnicode_HASH(self) = -1;
15327 #else
15328 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15329 #endif
15330 _PyUnicode_STATE(self).interned = 0;
15331 _PyUnicode_STATE(self).kind = kind;
15332 _PyUnicode_STATE(self).compact = 0;
15333 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15334 _PyUnicode_STATE(self).ready = 1;
15335 _PyUnicode_WSTR(self) = NULL;
15336 _PyUnicode_UTF8_LENGTH(self) = 0;
15337 _PyUnicode_UTF8(self) = NULL;
15338 _PyUnicode_WSTR_LENGTH(self) = 0;
15339 _PyUnicode_DATA_ANY(self) = NULL;
15340
15341 share_utf8 = 0;
15342 share_wstr = 0;
15343 if (kind == PyUnicode_1BYTE_KIND) {
15344 char_size = 1;
15345 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15346 share_utf8 = 1;
15347 }
15348 else if (kind == PyUnicode_2BYTE_KIND) {
15349 char_size = 2;
15350 if (sizeof(wchar_t) == 2)
15351 share_wstr = 1;
15352 }
15353 else {
15354 assert(kind == PyUnicode_4BYTE_KIND);
15355 char_size = 4;
15356 if (sizeof(wchar_t) == 4)
15357 share_wstr = 1;
15358 }
15359
15360 /* Ensure we won't overflow the length. */
15361 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15362 PyErr_NoMemory();
15363 goto onError;
15364 }
15365 data = PyObject_Malloc((length + 1) * char_size);
15366 if (data == NULL) {
15367 PyErr_NoMemory();
15368 goto onError;
15369 }
15370
15371 _PyUnicode_DATA_ANY(self) = data;
15372 if (share_utf8) {
15373 _PyUnicode_UTF8_LENGTH(self) = length;
15374 _PyUnicode_UTF8(self) = data;
15375 }
15376 if (share_wstr) {
15377 _PyUnicode_WSTR_LENGTH(self) = length;
15378 _PyUnicode_WSTR(self) = (wchar_t *)data;
15379 }
15380
15381 memcpy(data, PyUnicode_DATA(unicode),
15382 kind * (length + 1));
15383 assert(_PyUnicode_CheckConsistency(self, 1));
15384 #ifdef Py_DEBUG
15385 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15386 #endif
15387 return self;
15388
15389 onError:
15390 Py_DECREF(self);
15391 return NULL;
15392 }
15393
/* Deallocate `op` by calling unicode_dealloc() directly.
   `op` must be an exact str instance; this is only checked by an
   assertion. */
void
_PyUnicode_ExactDealloc(PyObject *op)
{
    assert(PyUnicode_CheckExact(op));
    unicode_dealloc(op);
}
15400
15401 PyDoc_STRVAR(unicode_doc,
15402 "str(object='') -> str\n\
15403 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15404 \n\
15405 Create a new string object from the given object. If encoding or\n\
15406 errors is specified, then the object must expose a data buffer\n\
15407 that will be decoded using the given encoding and error handler.\n\
15408 Otherwise, returns the result of object.__str__() (if defined)\n\
15409 or repr(object).\n\
15410 encoding defaults to sys.getdefaultencoding().\n\
15411 errors defaults to 'strict'.");
15412
15413 static PyObject *unicode_iter(PyObject *seq);
15414
15415 PyTypeObject PyUnicode_Type = {
15416 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15417 "str", /* tp_name */
15418 sizeof(PyUnicodeObject), /* tp_basicsize */
15419 0, /* tp_itemsize */
15420 /* Slots */
15421 (destructor)unicode_dealloc, /* tp_dealloc */
15422 0, /* tp_vectorcall_offset */
15423 0, /* tp_getattr */
15424 0, /* tp_setattr */
15425 0, /* tp_as_async */
15426 unicode_repr, /* tp_repr */
15427 &unicode_as_number, /* tp_as_number */
15428 &unicode_as_sequence, /* tp_as_sequence */
15429 &unicode_as_mapping, /* tp_as_mapping */
15430 (hashfunc) unicode_hash, /* tp_hash*/
15431 0, /* tp_call*/
15432 (reprfunc) unicode_str, /* tp_str */
15433 PyObject_GenericGetAttr, /* tp_getattro */
15434 0, /* tp_setattro */
15435 0, /* tp_as_buffer */
15436 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15437 Py_TPFLAGS_UNICODE_SUBCLASS |
15438 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15439 unicode_doc, /* tp_doc */
15440 0, /* tp_traverse */
15441 0, /* tp_clear */
15442 PyUnicode_RichCompare, /* tp_richcompare */
15443 0, /* tp_weaklistoffset */
15444 unicode_iter, /* tp_iter */
15445 0, /* tp_iternext */
15446 unicode_methods, /* tp_methods */
15447 0, /* tp_members */
15448 0, /* tp_getset */
15449 0, /* tp_base */
15450 0, /* tp_dict */
15451 0, /* tp_descr_get */
15452 0, /* tp_descr_set */
15453 0, /* tp_dictoffset */
15454 0, /* tp_init */
15455 0, /* tp_alloc */
15456 unicode_new, /* tp_new */
15457 PyObject_Del, /* tp_free */
15458 };
15459
15460 /* Initialize the Unicode implementation */
15461
15462 void
_PyUnicode_InitState(PyInterpreterState * interp)15463 _PyUnicode_InitState(PyInterpreterState *interp)
15464 {
15465 if (!_Py_IsMainInterpreter(interp)) {
15466 return;
15467 }
15468
15469 /* initialize the linebreak bloom filter */
15470 const Py_UCS2 linebreak[] = {
15471 0x000A, /* LINE FEED */
15472 0x000D, /* CARRIAGE RETURN */
15473 0x001C, /* FILE SEPARATOR */
15474 0x001D, /* GROUP SEPARATOR */
15475 0x001E, /* RECORD SEPARATOR */
15476 0x0085, /* NEXT LINE */
15477 0x2028, /* LINE SEPARATOR */
15478 0x2029, /* PARAGRAPH SEPARATOR */
15479 };
15480 bloom_linebreak = make_bloom_mask(
15481 PyUnicode_2BYTE_KIND, linebreak,
15482 Py_ARRAY_LENGTH(linebreak));
15483 }
15484
15485
15486 PyStatus
_PyUnicode_InitGlobalObjects(PyInterpreterState * interp)15487 _PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15488 {
15489 if (!_Py_IsMainInterpreter(interp)) {
15490 return _PyStatus_OK();
15491 }
15492
15493 #ifdef Py_DEBUG
15494 assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
15495
15496 for (int i = 0; i < 256; i++) {
15497 assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
15498 }
15499 #endif
15500
15501 return _PyStatus_OK();
15502 }
15503
15504
15505 PyStatus
_PyUnicode_InitTypes(PyInterpreterState * interp)15506 _PyUnicode_InitTypes(PyInterpreterState *interp)
15507 {
15508 if (!_Py_IsMainInterpreter(interp)) {
15509 return _PyStatus_OK();
15510 }
15511
15512 if (PyType_Ready(&EncodingMapType) < 0) {
15513 goto error;
15514 }
15515 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15516 goto error;
15517 }
15518 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15519 goto error;
15520 }
15521 return _PyStatus_OK();
15522
15523 error:
15524 return _PyStatus_ERR("Can't initialize unicode types");
15525 }
15526
15527
/* Intern the string *p in place.

   If an equal string is already in the `interned` dict, *p is replaced by a
   new reference to that string and the reference to the original is
   released.  Otherwise *p itself is stored in the dict and marked
   SSTATE_INTERNED_MORTAL.

   The function never reports an error: on any failure the pending exception
   is cleared and *p is left unchanged.  Subclass instances and strings that
   are already interned are also left unchanged. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    /* Release builds silently ignore NULL and non-str objects. */
    if (s == NULL || !PyUnicode_Check(s)) {
        return;
    }
#endif

    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s)) {
        return;
    }

    if (PyUnicode_CHECK_INTERNED(s)) {
        return;
    }

    if (PyUnicode_READY(s) == -1) {
        PyErr_Clear();
        return;
    }

    /* The interned dict is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }

    /* Insert s as both key and value unless an equal key already exists;
       t is the value stored in the dict afterwards. */
    PyObject *t = PyDict_SetDefault(interned, s, s);
    if (t == NULL) {
        PyErr_Clear();
        return;
    }

    if (t != s) {
        /* An equal string was interned before: hand it out instead.
           Py_SETREF() releases the reference to s that *p held. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }

    /* The two references in interned dict (key and value) are not counted by
       refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
       this. */
    Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15582
/* Deprecated: intern *p with PyUnicode_InternInPlace() and then mark the
   resulting string as SSTATE_INTERNED_IMMORTAL, adding one reference to it.

   A DeprecationWarning is emitted first; if emitting it raises, the
   exception is reported through PyErr_WriteUnraisable() and the function
   carries on. */
void
PyUnicode_InternImmortal(PyObject **p)
{
    if (PyErr_WarnEx(PyExc_DeprecationWarning,
            "PyUnicode_InternImmortal() is deprecated; "
            "use PyUnicode_InternInPlace() instead", 1) < 0)
    {
        // The function has no return value, the exception cannot
        // be reported to the caller, so just log it.
        PyErr_WriteUnraisable(NULL);
    }

    PyUnicode_InternInPlace(p);
    /* NOTE(review): the state is changed whenever it is not already
       IMMORTAL, i.e. also when PyUnicode_InternInPlace() returned without
       interning *p. */
    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
        Py_INCREF(*p);
    }
}
15601
15602 PyObject *
PyUnicode_InternFromString(const char * cp)15603 PyUnicode_InternFromString(const char *cp)
15604 {
15605 PyObject *s = PyUnicode_FromString(cp);
15606 if (s == NULL)
15607 return NULL;
15608 PyUnicode_InternInPlace(&s);
15609 return s;
15610 }
15611
15612
/* Release the dict of interned strings.

   Does nothing for interpreters other than the main one, or when the dict
   was never created.  Every string in the dict gets back the references
   that interning left uncounted and is marked SSTATE_NOT_INTERNED; then the
   dict is cleared and released.  With INTERNED_STATS defined, statistics
   are printed to stderr. */
void
_PyUnicode_ClearInterned(PyInterpreterState *interp)
{
    if (!_Py_IsMainInterpreter(interp)) {
        // interned dict is shared by all interpreters
        return;
    }

    if (interned == NULL) {
        return;
    }
    assert(PyDict_CheckExact(interned));

    /* Interned unicode strings are not forcibly deallocated; rather, we give
       them their stolen references back, and then clear and DECREF the
       interned dict. */

#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %zd interned strings\n",
            PyDict_GET_SIZE(interned));

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    Py_ssize_t pos = 0;
    PyObject *s, *ignored_value;
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
        assert(PyUnicode_IS_READY(s));

        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Immortal strings get one reference back here. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            // Restore the two references (key and value) ignored
            // by PyUnicode_InternInPlace().
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            /* Every key of the dict is expected to be marked interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr,
            "total size of all interned strings: %zd/%zd mortal/immortal\n",
            mortal_size, immortal_size);
#endif

    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
15672
15673
/********************* Unicode Iterator **************************/

/* State of a string iterator.  The same layout is used by both
   PyUnicodeIter_Type and _PyUnicodeASCIIIter_Type. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15681
/* tp_dealloc of the string iterators: untrack the iterator from the GC,
   release the string being iterated (if any) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15689
/* tp_traverse of the string iterators: the only object reference held by
   an iterator is it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15696
15697 static PyObject *
unicodeiter_next(unicodeiterobject * it)15698 unicodeiter_next(unicodeiterobject *it)
15699 {
15700 PyObject *seq;
15701
15702 assert(it != NULL);
15703 seq = it->it_seq;
15704 if (seq == NULL)
15705 return NULL;
15706 assert(_PyUnicode_CHECK(seq));
15707
15708 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15709 int kind = PyUnicode_KIND(seq);
15710 const void *data = PyUnicode_DATA(seq);
15711 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15712 it->it_index++;
15713 return unicode_char(chr);
15714 }
15715
15716 it->it_seq = NULL;
15717 Py_DECREF(seq);
15718 return NULL;
15719 }
15720
15721 static PyObject *
unicode_ascii_iter_next(unicodeiterobject * it)15722 unicode_ascii_iter_next(unicodeiterobject *it)
15723 {
15724 assert(it != NULL);
15725 PyObject *seq = it->it_seq;
15726 if (seq == NULL) {
15727 return NULL;
15728 }
15729 assert(_PyUnicode_CHECK(seq));
15730 assert(PyUnicode_IS_COMPACT_ASCII(seq));
15731 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15732 const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
15733 Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
15734 data, it->it_index);
15735 it->it_index++;
15736 PyObject *item = (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
15737 return Py_NewRef(item);
15738 }
15739 it->it_seq = NULL;
15740 Py_DECREF(seq);
15741 return NULL;
15742 }
15743
15744 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15745 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15746 {
15747 Py_ssize_t len = 0;
15748 if (it->it_seq)
15749 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15750 return PyLong_FromSsize_t(len);
15751 }
15752
15753 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15754
15755 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15756 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15757 {
15758 PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
15759
15760 /* _PyEval_GetBuiltin can invoke arbitrary code,
15761 * call must be before access of iterator pointers.
15762 * see issue #101765 */
15763
15764 if (it->it_seq != NULL) {
15765 return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
15766 } else {
15767 PyObject *u = (PyObject *)_PyUnicode_New(0);
15768 if (u == NULL) {
15769 Py_XDECREF(iter);
15770 return NULL;
15771 }
15772 return Py_BuildValue("N(N)", iter, u);
15773 }
15774 }
15775
15776 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15777
15778 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15779 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15780 {
15781 Py_ssize_t index = PyLong_AsSsize_t(state);
15782 if (index == -1 && PyErr_Occurred())
15783 return NULL;
15784 if (it->it_seq != NULL) {
15785 if (index < 0)
15786 index = 0;
15787 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15788 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15789 it->it_index = index;
15790 }
15791 Py_RETURN_NONE;
15792 }
15793
15794 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15795
15796 static PyMethodDef unicodeiter_methods[] = {
15797 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15798 length_hint_doc},
15799 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15800 reduce_doc},
15801 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15802 setstate_doc},
15803 {NULL, NULL} /* sentinel */
15804 };
15805
15806 PyTypeObject PyUnicodeIter_Type = {
15807 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15808 "str_iterator", /* tp_name */
15809 sizeof(unicodeiterobject), /* tp_basicsize */
15810 0, /* tp_itemsize */
15811 /* methods */
15812 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15813 0, /* tp_vectorcall_offset */
15814 0, /* tp_getattr */
15815 0, /* tp_setattr */
15816 0, /* tp_as_async */
15817 0, /* tp_repr */
15818 0, /* tp_as_number */
15819 0, /* tp_as_sequence */
15820 0, /* tp_as_mapping */
15821 0, /* tp_hash */
15822 0, /* tp_call */
15823 0, /* tp_str */
15824 PyObject_GenericGetAttr, /* tp_getattro */
15825 0, /* tp_setattro */
15826 0, /* tp_as_buffer */
15827 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15828 0, /* tp_doc */
15829 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15830 0, /* tp_clear */
15831 0, /* tp_richcompare */
15832 0, /* tp_weaklistoffset */
15833 PyObject_SelfIter, /* tp_iter */
15834 (iternextfunc)unicodeiter_next, /* tp_iternext */
15835 unicodeiter_methods, /* tp_methods */
15836 0,
15837 };
15838
/* Type of the iterator that unicode_iter() creates for compact ASCII
   strings.  It has the same object layout and methods as PyUnicodeIter_Type
   and differs only in its name and its tp_iternext implementation. */
PyTypeObject _PyUnicodeASCIIIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    .tp_name = "str_ascii_iterator",
    .tp_basicsize = sizeof(unicodeiterobject),
    .tp_dealloc = (destructor)unicodeiter_dealloc,
    .tp_getattro = PyObject_GenericGetAttr,
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
    .tp_traverse = (traverseproc)unicodeiter_traverse,
    .tp_iter = PyObject_SelfIter,
    .tp_iternext = (iternextfunc)unicode_ascii_iter_next,
    .tp_methods = unicodeiter_methods,
};
15851
15852 static PyObject *
unicode_iter(PyObject * seq)15853 unicode_iter(PyObject *seq)
15854 {
15855 unicodeiterobject *it;
15856
15857 if (!PyUnicode_Check(seq)) {
15858 PyErr_BadInternalCall();
15859 return NULL;
15860 }
15861 if (PyUnicode_READY(seq) == -1)
15862 return NULL;
15863 if (PyUnicode_IS_COMPACT_ASCII(seq)) {
15864 it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
15865 }
15866 else {
15867 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15868 }
15869 if (it == NULL)
15870 return NULL;
15871 it->it_index = 0;
15872 Py_INCREF(seq);
15873 it->it_seq = seq;
15874 _PyObject_GC_TRACK(it);
15875 return (PyObject *)it;
15876 }
15877
15878 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15879 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15880 {
15881 int res;
15882 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15883 if (res == -2) {
15884 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15885 return -1;
15886 }
15887 if (res < 0) {
15888 PyErr_NoMemory();
15889 return -1;
15890 }
15891 return 0;
15892 }
15893
15894
15895 static int
config_get_codec_name(wchar_t ** config_encoding)15896 config_get_codec_name(wchar_t **config_encoding)
15897 {
15898 char *encoding;
15899 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15900 return -1;
15901 }
15902
15903 PyObject *name_obj = NULL;
15904 PyObject *codec = _PyCodec_Lookup(encoding);
15905 PyMem_RawFree(encoding);
15906
15907 if (!codec)
15908 goto error;
15909
15910 name_obj = PyObject_GetAttrString(codec, "name");
15911 Py_CLEAR(codec);
15912 if (!name_obj) {
15913 goto error;
15914 }
15915
15916 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15917 Py_DECREF(name_obj);
15918 if (wname == NULL) {
15919 goto error;
15920 }
15921
15922 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15923 if (raw_wname == NULL) {
15924 PyMem_Free(wname);
15925 PyErr_NoMemory();
15926 goto error;
15927 }
15928
15929 PyMem_RawFree(*config_encoding);
15930 *config_encoding = raw_wname;
15931
15932 PyMem_Free(wname);
15933 return 0;
15934
15935 error:
15936 Py_XDECREF(codec);
15937 Py_XDECREF(name_obj);
15938 return -1;
15939 }
15940
15941
15942 static PyStatus
init_stdio_encoding(PyInterpreterState * interp)15943 init_stdio_encoding(PyInterpreterState *interp)
15944 {
15945 /* Update the stdio encoding to the normalized Python codec name. */
15946 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
15947 if (config_get_codec_name(&config->stdio_encoding) < 0) {
15948 return _PyStatus_ERR("failed to get the Python codec name "
15949 "of the stdio encoding");
15950 }
15951 return _PyStatus_OK();
15952 }
15953
15954
/* Set up interp->unicode.fs_codec from config->filesystem_encoding and
   config->filesystem_errors, then update the global filesystem encoding
   through _Py_SetFileSystemEncoding().

   Return 0 on success.  On failure set an exception and return -1. */
static int
init_fs_codec(PyInterpreterState *interp)
{
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);

    /* Validate the error handler before anything is allocated. */
    _Py_error_handler error_handler;
    error_handler = get_error_handler_wide(config->filesystem_errors);
    if (error_handler == _Py_ERROR_UNKNOWN) {
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
        return -1;
    }

    char *encoding, *errors;
    if (encode_wstr_utf8(config->filesystem_encoding,
                         &encoding,
                         "filesystem_encoding") < 0) {
        return -1;
    }

    if (encode_wstr_utf8(config->filesystem_errors,
                         &errors,
                         "filesystem_errors") < 0) {
        PyMem_RawFree(encoding);
        return -1;
    }

    /* From here on fs_codec owns `encoding` and `errors`; the strings it
       held before are released. */
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
    PyMem_RawFree(fs_codec->encoding);
    fs_codec->encoding = encoding;
    /* encoding has been normalized by init_fs_encoding() */
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
    PyMem_RawFree(fs_codec->errors);
    fs_codec->errors = errors;
    fs_codec->error_handler = error_handler;

#ifdef _Py_FORCE_UTF8_FS_ENCODING
    assert(fs_codec->utf8 == 1);
#endif

    /* At this point, PyUnicode_EncodeFSDefault() and
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
       the C implementation of the filesystem encoding. */

    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
       global configuration variables. */
    if (_Py_SetFileSystemEncoding(fs_codec->encoding,
                                  fs_codec->errors) < 0) {
        PyErr_NoMemory();
        return -1;
    }
    return 0;
}
16007
16008
16009 static PyStatus
init_fs_encoding(PyThreadState * tstate)16010 init_fs_encoding(PyThreadState *tstate)
16011 {
16012 PyInterpreterState *interp = tstate->interp;
16013
16014 /* Update the filesystem encoding to the normalized Python codec name.
16015 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16016 (Python codec name). */
16017 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16018 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16019 _Py_DumpPathConfig(tstate);
16020 return _PyStatus_ERR("failed to get the Python codec "
16021 "of the filesystem encoding");
16022 }
16023
16024 if (init_fs_codec(interp) < 0) {
16025 return _PyStatus_ERR("cannot initialize filesystem codec");
16026 }
16027 return _PyStatus_OK();
16028 }
16029
16030
16031 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)16032 _PyUnicode_InitEncodings(PyThreadState *tstate)
16033 {
16034 PyStatus status = init_fs_encoding(tstate);
16035 if (_PyStatus_EXCEPTION(status)) {
16036 return status;
16037 }
16038
16039 return init_stdio_encoding(tstate->interp);
16040 }
16041
16042
16043 static void
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec * fs_codec)16044 _PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16045 {
16046 PyMem_RawFree(fs_codec->encoding);
16047 fs_codec->encoding = NULL;
16048 fs_codec->utf8 = 0;
16049 PyMem_RawFree(fs_codec->errors);
16050 fs_codec->errors = NULL;
16051 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16052 }
16053
16054
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.
   Return the result of init_fs_codec(), or -1 with MemoryError set if the
   new strings cannot be allocated (the configuration is then unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        /* Both copies exist: install them in place of the old values. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16080
16081
#ifdef Py_DEBUG
/* Debug-only helper: return 1 when the `interned` dict does not exist,
   i.e. it has not been created yet or has already been cleared. */
static inline int
unicode_is_finalizing(void)
{
    return (interned == NULL);
}
#endif
16089
16090
16091 void
_PyUnicode_FiniTypes(PyInterpreterState * interp)16092 _PyUnicode_FiniTypes(PyInterpreterState *interp)
16093 {
16094 if (!_Py_IsMainInterpreter(interp)) {
16095 return;
16096 }
16097
16098 _PyStaticType_Dealloc(&EncodingMapType);
16099 _PyStaticType_Dealloc(&PyFieldNameIter_Type);
16100 _PyStaticType_Dealloc(&PyFormatterIter_Type);
16101 }
16102
16103
/* Free the separately allocated buffers (wstr and, for non-ASCII strings,
   utf8) of a compact string and reset the matching fields.  The object
   itself is not freed. */
static void unicode_static_dealloc(PyObject *op)
{
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);

    assert(ascii->state.compact);

    if (ascii->state.ascii) {
        /* ASCII strings: only the wstr buffer is handled here. */
        if (ascii->wstr != NULL) {
            PyObject_Free(ascii->wstr);
            ascii->wstr = NULL;
        }
        return;
    }

    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    void *inline_data = (void *)(compact + 1);

    /* wstr may point at the memory right after the object; it is a
       separate allocation only when it points elsewhere. */
    if (ascii->wstr != NULL && ascii->wstr != inline_data) {
        PyObject_Free(ascii->wstr);
        ascii->wstr = NULL;
        compact->wstr_length = 0;
    }
    if (compact->utf8 != NULL) {
        PyObject_Free(compact->utf8);
        compact->utf8 = NULL;
        compact->utf8_length = 0;
    }
}
16131
16132
16133 void
_PyUnicode_Fini(PyInterpreterState * interp)16134 _PyUnicode_Fini(PyInterpreterState *interp)
16135 {
16136 struct _Py_unicode_state *state = &interp->unicode;
16137
16138 if (_Py_IsMainInterpreter(interp)) {
16139 // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16140 assert(interned == NULL);
16141 // bpo-47182: force a unicodedata CAPI capsule re-import on
16142 // subsequent initialization of main interpreter.
16143 ucnhash_capi = NULL;
16144 }
16145
16146 _PyUnicode_FiniEncodings(&state->fs_codec);
16147
16148 unicode_clear_identifiers(state);
16149
16150 // Clear the single character singletons
16151 for (int i = 0; i < 128; i++) {
16152 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).ascii[i]);
16153 }
16154 for (int i = 0; i < 128; i++) {
16155 unicode_static_dealloc((PyObject*)&_Py_SINGLETON(strings).latin1[i]);
16156 }
16157 }
16158
16159
/* Non-static entry point for unicode_static_dealloc(): free the separately
   allocated buffers of the compact string `op`. */
void
_PyStaticUnicode_Dealloc(PyObject *op)
{
    unicode_static_dealloc(op);
}
16165
16166
16167 /* A _string module, to export formatter_parser and formatter_field_name_split
16168 to the string.Formatter class implemented in Python. */
16169
16170 static PyMethodDef _string_methods[] = {
16171 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16172 METH_O, PyDoc_STR("split the argument as a field name")},
16173 {"formatter_parser", (PyCFunction) formatter_parser,
16174 METH_O, PyDoc_STR("parse the argument as a format string")},
16175 {NULL, NULL}
16176 };
16177
/* Definition of the _string module: no per-module state (m_size is 0) and
   only the functions listed in _string_methods. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_string",
    .m_doc = PyDoc_STR("string helper module"),
    .m_size = 0,
    .m_methods = _string_methods,
};
16185
/* Initialization function of the _string module: hands the module
   definition to PyModuleDef_Init() and returns its result. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModuleDef_Init(&_string_module);
}
16191
16192
16193 #ifdef __cplusplus
16194 }
16195 #endif
16196