1 /* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5 Written by Marc-Andre Lemburg ([email protected]).
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 ------------------------------------------------------------------------ */
10
11 #include "Python.h"
12 #include "pycore_call.h" // _PyObject_CallNoArgs()
13 #include "pycore_interp.h" // PyInterpreterState.codec_search_path
14 #include "pycore_pystate.h" // _PyInterpreterState_GET()
15 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
16 #include <ctype.h>
17
/* Lowercase hexadecimal digits.  The error handlers in this file index
   this string with a 4-bit value (0-15) to format escape sequences. */
const char *Py_hexdigits = "0123456789abcdef";
19
20 /* --- Codec Registry ----------------------------------------------------- */
21
22 /* Import the standard encodings package which will register the first
23 codec search function.
24
25 This is done in a lazy way so that the Unicode implementation does
26 not downgrade startup time of scripts not needing it.
27
28 ImportErrors are silently ignored by this function. Only one try is
29 made.
30
31 */
32
33 static int _PyCodecRegistry_Init(void); /* Forward */
34
/* Append search_function to the current interpreter's codec search path.

   The codec registry is initialized on first use.  Returns the result of
   PyList_Append() when the argument is accepted, or -1 when the registry
   could not be initialized, when search_function is NULL (bad-argument
   error set) or when it is not callable (TypeError set). */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    /* Lazily create the registry the first time it is needed. */
    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init()) {
            return -1;
        }
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(interp->codec_search_path, search_function);
}
53
54 int
PyCodec_Unregister(PyObject * search_function)55 PyCodec_Unregister(PyObject *search_function)
56 {
57 PyInterpreterState *interp = PyInterpreterState_Get();
58 PyObject *codec_search_path = interp->codec_search_path;
59 /* Do nothing if codec_search_path is not created yet or was cleared. */
60 if (codec_search_path == NULL) {
61 return 0;
62 }
63
64 assert(PyList_CheckExact(codec_search_path));
65 Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
66 for (Py_ssize_t i = 0; i < n; i++) {
67 PyObject *item = PyList_GET_ITEM(codec_search_path, i);
68 if (item == search_function) {
69 if (interp->codec_search_cache != NULL) {
70 assert(PyDict_CheckExact(interp->codec_search_cache));
71 PyDict_Clear(interp->codec_search_cache);
72 }
73 return PyList_SetSlice(codec_search_path, i, i+1, NULL);
74 }
75 }
76 return 0;
77 }
78
79 extern int _Py_normalize_encoding(const char *, char *, size_t);
80
/* Convert a C string (decoded from UTF-8) to a normalized Python string:
   all characters are converted to lower case, and spaces and hyphens are
   replaced with underscores. */
83
84 static
normalizestring(const char * string)85 PyObject *normalizestring(const char *string)
86 {
87 size_t len = strlen(string);
88 char *encoding;
89 PyObject *v;
90
91 if (len > PY_SSIZE_T_MAX) {
92 PyErr_SetString(PyExc_OverflowError, "string is too large");
93 return NULL;
94 }
95
96 encoding = PyMem_Malloc(len + 1);
97 if (encoding == NULL)
98 return PyErr_NoMemory();
99
100 if (!_Py_normalize_encoding(string, encoding, len + 1))
101 {
102 PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
103 PyMem_Free(encoding);
104 return NULL;
105 }
106
107 v = PyUnicode_FromString(encoding);
108 PyMem_Free(encoding);
109 return v;
110 }
111
112 /* Lookup the given encoding and return a tuple providing the codec
113 facilities.
114
115 The encoding string is looked up converted to all lower-case
116 characters. This makes encodings looked up through this mechanism
117 effectively case-insensitive.
118
119 If no codec is found, a LookupError is set and NULL returned.
120
121 As side effect, this tries to load the encodings package, if not
122 yet done. This is part of the lazy load strategy for the encodings
123 package.
124
125 */
126
_PyCodec_Lookup(const char * encoding)127 PyObject *_PyCodec_Lookup(const char *encoding)
128 {
129 if (encoding == NULL) {
130 PyErr_BadArgument();
131 return NULL;
132 }
133
134 PyInterpreterState *interp = _PyInterpreterState_GET();
135 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
136 return NULL;
137 }
138
139 /* Convert the encoding to a normalized Python string: all
140 characters are converted to lower case, spaces and hyphens are
141 replaced with underscores. */
142 PyObject *v = normalizestring(encoding);
143 if (v == NULL) {
144 return NULL;
145 }
146 PyUnicode_InternInPlace(&v);
147
148 /* First, try to lookup the name in the registry dictionary */
149 PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
150 if (result != NULL) {
151 Py_INCREF(result);
152 Py_DECREF(v);
153 return result;
154 }
155 else if (PyErr_Occurred()) {
156 goto onError;
157 }
158
159 /* Next, scan the search functions in order of registration */
160 const Py_ssize_t len = PyList_Size(interp->codec_search_path);
161 if (len < 0)
162 goto onError;
163 if (len == 0) {
164 PyErr_SetString(PyExc_LookupError,
165 "no codec search functions registered: "
166 "can't find encoding");
167 goto onError;
168 }
169
170 Py_ssize_t i;
171 for (i = 0; i < len; i++) {
172 PyObject *func;
173
174 func = PyList_GetItem(interp->codec_search_path, i);
175 if (func == NULL)
176 goto onError;
177 result = PyObject_CallOneArg(func, v);
178 if (result == NULL)
179 goto onError;
180 if (result == Py_None) {
181 Py_DECREF(result);
182 continue;
183 }
184 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
185 PyErr_SetString(PyExc_TypeError,
186 "codec search functions must return 4-tuples");
187 Py_DECREF(result);
188 goto onError;
189 }
190 break;
191 }
192 if (i == len) {
193 /* XXX Perhaps we should cache misses too ? */
194 PyErr_Format(PyExc_LookupError,
195 "unknown encoding: %s", encoding);
196 goto onError;
197 }
198
199 /* Cache and return the result */
200 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
201 Py_DECREF(result);
202 goto onError;
203 }
204 Py_DECREF(v);
205 return result;
206
207 onError:
208 Py_DECREF(v);
209 return NULL;
210 }
211
212 /* Codec registry encoding check API. */
213
PyCodec_KnownEncoding(const char * encoding)214 int PyCodec_KnownEncoding(const char *encoding)
215 {
216 PyObject *codecs;
217
218 codecs = _PyCodec_Lookup(encoding);
219 if (!codecs) {
220 PyErr_Clear();
221 return 0;
222 }
223 else {
224 Py_DECREF(codecs);
225 return 1;
226 }
227 }
228
229 static
args_tuple(PyObject * object,const char * errors)230 PyObject *args_tuple(PyObject *object,
231 const char *errors)
232 {
233 PyObject *args;
234
235 args = PyTuple_New(1 + (errors != NULL));
236 if (args == NULL)
237 return NULL;
238 Py_INCREF(object);
239 PyTuple_SET_ITEM(args,0,object);
240 if (errors) {
241 PyObject *v;
242
243 v = PyUnicode_FromString(errors);
244 if (v == NULL) {
245 Py_DECREF(args);
246 return NULL;
247 }
248 PyTuple_SET_ITEM(args, 1, v);
249 }
250 return args;
251 }
252
253 /* Helper function to get a codec item */
254
255 static
codec_getitem(const char * encoding,int index)256 PyObject *codec_getitem(const char *encoding, int index)
257 {
258 PyObject *codecs;
259 PyObject *v;
260
261 codecs = _PyCodec_Lookup(encoding);
262 if (codecs == NULL)
263 return NULL;
264 v = PyTuple_GET_ITEM(codecs, index);
265 Py_DECREF(codecs);
266 Py_INCREF(v);
267 return v;
268 }
269
270 /* Helper functions to create an incremental codec. */
271 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)272 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
273 const char *errors,
274 const char *attrname)
275 {
276 PyObject *ret, *inccodec;
277
278 inccodec = PyObject_GetAttrString(codec_info, attrname);
279 if (inccodec == NULL)
280 return NULL;
281 if (errors)
282 ret = PyObject_CallFunction(inccodec, "s", errors);
283 else
284 ret = _PyObject_CallNoArgs(inccodec);
285 Py_DECREF(inccodec);
286 return ret;
287 }
288
289 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)290 PyObject *codec_getincrementalcodec(const char *encoding,
291 const char *errors,
292 const char *attrname)
293 {
294 PyObject *codec_info, *ret;
295
296 codec_info = _PyCodec_Lookup(encoding);
297 if (codec_info == NULL)
298 return NULL;
299 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
300 Py_DECREF(codec_info);
301 return ret;
302 }
303
304 /* Helper function to create a stream codec. */
305
306 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)307 PyObject *codec_getstreamcodec(const char *encoding,
308 PyObject *stream,
309 const char *errors,
310 const int index)
311 {
312 PyObject *codecs, *streamcodec, *codeccls;
313
314 codecs = _PyCodec_Lookup(encoding);
315 if (codecs == NULL)
316 return NULL;
317
318 codeccls = PyTuple_GET_ITEM(codecs, index);
319 if (errors != NULL)
320 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
321 else
322 streamcodec = PyObject_CallOneArg(codeccls, stream);
323 Py_DECREF(codecs);
324 return streamcodec;
325 }
326
327 /* Helpers to work with the result of _PyCodec_Lookup
328
329 */
/* Create an incremental decoder from an already looked-up codec_info by
   calling its "incrementaldecoder" attribute. */
PyObject *
_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, const char *errors)
{
    const char *attrname = "incrementaldecoder";
    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
336
/* Create an incremental encoder from an already looked-up codec_info by
   calling its "incrementalencoder" attribute. */
PyObject *
_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, const char *errors)
{
    const char *attrname = "incrementalencoder";
    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
343
344
345 /* Convenience APIs to query the Codec registry.
346
347 All APIs return a codec object with incremented refcount.
348
349 */
350
PyCodec_Encoder(const char * encoding)351 PyObject *PyCodec_Encoder(const char *encoding)
352 {
353 return codec_getitem(encoding, 0);
354 }
355
PyCodec_Decoder(const char * encoding)356 PyObject *PyCodec_Decoder(const char *encoding)
357 {
358 return codec_getitem(encoding, 1);
359 }
360
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)361 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
362 const char *errors)
363 {
364 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
365 }
366
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)367 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
368 const char *errors)
369 {
370 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
371 }
372
/* Wrap `stream` with the factory stored at index 2 of the codec tuple
   registered for `encoding`. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int reader_index = 2;
    return codec_getstreamcodec(encoding, stream, errors, reader_index);
}
379
/* Wrap `stream` with the factory stored at index 3 of the codec tuple
   registered for `encoding`. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int writer_index = 3;
    return codec_getstreamcodec(encoding, stream, errors, writer_index);
}
386
387 /* Helper that tries to ensure the reported exception chain indicates the
388 * codec that was invoked to trigger the failure without changing the type
389 * of the exception raised.
390 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* Best effort only: when it can, _PyErr_TrySetFromCause() swaps the
     * active exception for an updated clone whose message names the
     * operation and the codec; when it cannot, the original exception is
     * left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
402
403 /* Encode an object (e.g. a Unicode object) using the given encoding
404 and return the resulting encoded object (usually a Python string).
405
406 errors is passed to the encoder factory as argument if non-NULL. */
407
408 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)409 _PyCodec_EncodeInternal(PyObject *object,
410 PyObject *encoder,
411 const char *encoding,
412 const char *errors)
413 {
414 PyObject *args = NULL, *result = NULL;
415 PyObject *v = NULL;
416
417 args = args_tuple(object, errors);
418 if (args == NULL)
419 goto onError;
420
421 result = PyObject_Call(encoder, args, NULL);
422 if (result == NULL) {
423 wrap_codec_error("encoding", encoding);
424 goto onError;
425 }
426
427 if (!PyTuple_Check(result) ||
428 PyTuple_GET_SIZE(result) != 2) {
429 PyErr_SetString(PyExc_TypeError,
430 "encoder must return a tuple (object, integer)");
431 goto onError;
432 }
433 v = PyTuple_GET_ITEM(result,0);
434 Py_INCREF(v);
435 /* We don't check or use the second (integer) entry. */
436
437 Py_DECREF(args);
438 Py_DECREF(encoder);
439 Py_DECREF(result);
440 return v;
441
442 onError:
443 Py_XDECREF(result);
444 Py_XDECREF(args);
445 Py_XDECREF(encoder);
446 return NULL;
447 }
448
449 /* Decode an object (usually a Python string) using the given encoding
450 and return an equivalent object (e.g. a Unicode object).
451
452 errors is passed to the decoder factory as argument if non-NULL. */
453
454 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)455 _PyCodec_DecodeInternal(PyObject *object,
456 PyObject *decoder,
457 const char *encoding,
458 const char *errors)
459 {
460 PyObject *args = NULL, *result = NULL;
461 PyObject *v;
462
463 args = args_tuple(object, errors);
464 if (args == NULL)
465 goto onError;
466
467 result = PyObject_Call(decoder, args, NULL);
468 if (result == NULL) {
469 wrap_codec_error("decoding", encoding);
470 goto onError;
471 }
472 if (!PyTuple_Check(result) ||
473 PyTuple_GET_SIZE(result) != 2) {
474 PyErr_SetString(PyExc_TypeError,
475 "decoder must return a tuple (object,integer)");
476 goto onError;
477 }
478 v = PyTuple_GET_ITEM(result,0);
479 Py_INCREF(v);
480 /* We don't check or use the second (integer) entry. */
481
482 Py_DECREF(args);
483 Py_DECREF(decoder);
484 Py_DECREF(result);
485 return v;
486
487 onError:
488 Py_XDECREF(args);
489 Py_XDECREF(decoder);
490 Py_XDECREF(result);
491 return NULL;
492 }
493
494 /* Generic encoding/decoding API */
/* Encode `object` with the encoder registered for `encoding`.  Returns
   the encoder's output object, or NULL if the encoder cannot be found or
   the encoding step fails. */
PyObject *
PyCodec_Encode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);
    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases `encoder` for us. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
507
/* Decode `object` with the decoder registered for `encoding`.  Returns
   the decoder's output object, or NULL if the decoder cannot be found or
   the decoding step fails. */
PyObject *
PyCodec_Decode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);
    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases `decoder` for us. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
520
521 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)522 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
523 const char *alternate_command)
524 {
525 PyObject *codec;
526 PyObject *attr;
527 int is_text_codec;
528
529 codec = _PyCodec_Lookup(encoding);
530 if (codec == NULL)
531 return NULL;
532
533 /* Backwards compatibility: assume any raw tuple describes a text
534 * encoding, and the same for anything lacking the private
535 * attribute.
536 */
537 if (!PyTuple_CheckExact(codec)) {
538 if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
539 Py_DECREF(codec);
540 return NULL;
541 }
542 if (attr != NULL) {
543 is_text_codec = PyObject_IsTrue(attr);
544 Py_DECREF(attr);
545 if (is_text_codec <= 0) {
546 Py_DECREF(codec);
547 if (!is_text_codec)
548 PyErr_Format(PyExc_LookupError,
549 "'%.400s' is not a text encoding; "
550 "use %s to handle arbitrary codecs",
551 encoding, alternate_command);
552 return NULL;
553 }
554 }
555 }
556
557 /* This appears to be a valid text encoding */
558 return codec;
559 }
560
561
562 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)563 PyObject *codec_getitem_checked(const char *encoding,
564 const char *alternate_command,
565 int index)
566 {
567 PyObject *codec;
568 PyObject *v;
569
570 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571 if (codec == NULL)
572 return NULL;
573
574 v = PyTuple_GET_ITEM(codec, index);
575 Py_INCREF(v);
576 Py_DECREF(codec);
577 return v;
578 }
579
_PyCodec_TextEncoder(const char * encoding)580 static PyObject * _PyCodec_TextEncoder(const char *encoding)
581 {
582 return codec_getitem_checked(encoding, "codecs.encode()", 0);
583 }
584
_PyCodec_TextDecoder(const char * encoding)585 static PyObject * _PyCodec_TextDecoder(const char *encoding)
586 {
587 return codec_getitem_checked(encoding, "codecs.decode()", 1);
588 }
589
/* Encode `object` with `encoding`, which must be a text encoding.
   Returns the encoder's output object, or NULL on failure. */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);
    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases `encoder` for us. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
602
/* Decode `object` with `encoding`, which must be a text encoding.
   Returns the decoder's output object, or NULL on failure. */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);
    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases `decoder` for us. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
615
/* Register the error handling callback function `error` under the name
   `name`. The codec calls this function when it encounters unencodable
   characters or undecodable bytes and `name` was given as the error
   parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error. */
PyCodec_RegisterError(const char * name,PyObject * error)622 int PyCodec_RegisterError(const char *name, PyObject *error)
623 {
624 PyInterpreterState *interp = _PyInterpreterState_GET();
625 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
626 return -1;
627 if (!PyCallable_Check(error)) {
628 PyErr_SetString(PyExc_TypeError, "handler must be callable");
629 return -1;
630 }
631 return PyDict_SetItemString(interp->codec_error_registry,
632 name, error);
633 }
634
635 /* Lookup the error handling callback function registered under the
636 name error. As a special case NULL can be passed, in which case
637 the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)638 PyObject *PyCodec_LookupError(const char *name)
639 {
640 PyObject *handler = NULL;
641
642 PyInterpreterState *interp = _PyInterpreterState_GET();
643 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
644 return NULL;
645
646 if (name==NULL)
647 name = "strict";
648 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649 if (handler) {
650 Py_INCREF(handler);
651 }
652 else if (!PyErr_Occurred()) {
653 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654 }
655 return handler;
656 }
657
/* Set a TypeError naming the type of `exc`; used by the error handlers
   below when they are handed an exception they do not support. */
static void
wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;
    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
664
/* "strict" handler: re-raise `exc` as the current exception.  Always
   returns NULL; a TypeError is raised instead when `exc` is not an
   exception instance. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
673
674
/* "ignore" handler: return (empty str, end), where `end` is the end
   position stored in the Unicode encode/decode/translate error `exc`.
   Any other exception type yields NULL with TypeError set. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    /* Pick the accessor matching the concrete exception type. */
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
    if (failed) {
        return NULL;
    }

    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
697
698
/* "replace" handler.  Returns a (replacement, end) tuple:

   - encode errors: one '?' per character in [start, end);
   - decode errors: a single Py_UNICODE_REPLACEMENT_CHARACTER;
   - translate errors: one Py_UNICODE_REPLACEMENT_CHARACTER per character
     in [start, end).

   Any other exception type yields NULL with TypeError set. */
PyObject *
PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }

        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, '?');
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_1BYTE_KIND);

        Py_UCS1 *dst = PyUnicode_1BYTE_DATA(filler);
        for (Py_ssize_t left = count; left > 0; left--) {
            *dst++ = '?';
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start)) {
            return NULL;
        }
        if (PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }

        const Py_ssize_t count = end - start;
        PyObject *filler = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (filler == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(filler) == PyUnicode_2BYTE_KIND);

        Py_UCS2 *dst = PyUnicode_2BYTE_DATA(filler);
        for (Py_ssize_t left = count; left > 0; left--) {
            *dst++ = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
751
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)752 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
753 {
754 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
755 PyObject *restuple;
756 PyObject *object;
757 Py_ssize_t i;
758 Py_ssize_t start;
759 Py_ssize_t end;
760 PyObject *res;
761 Py_UCS1 *outp;
762 Py_ssize_t ressize;
763 Py_UCS4 ch;
764 if (PyUnicodeEncodeError_GetStart(exc, &start))
765 return NULL;
766 if (PyUnicodeEncodeError_GetEnd(exc, &end))
767 return NULL;
768 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
769 return NULL;
770 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
771 end = start + PY_SSIZE_T_MAX / (2+7+1);
772 for (i = start, ressize = 0; i < end; ++i) {
773 /* object is guaranteed to be "ready" */
774 ch = PyUnicode_READ_CHAR(object, i);
775 if (ch<10)
776 ressize += 2+1+1;
777 else if (ch<100)
778 ressize += 2+2+1;
779 else if (ch<1000)
780 ressize += 2+3+1;
781 else if (ch<10000)
782 ressize += 2+4+1;
783 else if (ch<100000)
784 ressize += 2+5+1;
785 else if (ch<1000000)
786 ressize += 2+6+1;
787 else
788 ressize += 2+7+1;
789 }
790 /* allocate replacement */
791 res = PyUnicode_New(ressize, 127);
792 if (res == NULL) {
793 Py_DECREF(object);
794 return NULL;
795 }
796 outp = PyUnicode_1BYTE_DATA(res);
797 /* generate replacement */
798 for (i = start; i < end; ++i) {
799 int digits;
800 int base;
801 ch = PyUnicode_READ_CHAR(object, i);
802 *outp++ = '&';
803 *outp++ = '#';
804 if (ch<10) {
805 digits = 1;
806 base = 1;
807 }
808 else if (ch<100) {
809 digits = 2;
810 base = 10;
811 }
812 else if (ch<1000) {
813 digits = 3;
814 base = 100;
815 }
816 else if (ch<10000) {
817 digits = 4;
818 base = 1000;
819 }
820 else if (ch<100000) {
821 digits = 5;
822 base = 10000;
823 }
824 else if (ch<1000000) {
825 digits = 6;
826 base = 100000;
827 }
828 else {
829 digits = 7;
830 base = 1000000;
831 }
832 while (digits-->0) {
833 *outp++ = '0' + ch/base;
834 ch %= base;
835 base /= 10;
836 }
837 *outp++ = ';';
838 }
839 assert(_PyUnicode_CheckConsistency(res, 1));
840 restuple = Py_BuildValue("(Nn)", res, end);
841 Py_DECREF(object);
842 return restuple;
843 }
844 else {
845 wrong_exception_type(exc);
846 return NULL;
847 }
848 }
849
/* "backslashreplace" handler.  Returns a (replacement, end) tuple:

   - decode errors: every byte in [start, end) becomes "\xNN";
   - encode and translate errors: every character in [start, end)
     becomes "\xNN", "\uNNNN" or "\UNNNNNNNN" depending on its code point.

   Any other exception type yields NULL with TypeError set.  In all
   cases the range is shortened, if necessary, so that the size of the
   replacement cannot overflow Py_ssize_t. */
PyObject *
PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Must be Py_ssize_t: the total can reach PY_SSIZE_T_MAX, far beyond
       the range of int. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Every byte expands to 4 characters; clamp the range so the
           multiplication below cannot overflow (mirrors the clamp used
           for the encode/translate case). */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            const unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Worst case is 1+1+8 output characters ("\U" + 8 hex digits) per
       input character. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* First pass: compute the exact size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: write the escapes.  The wider forms emit their extra
       leading hex digits first and then share the final two digits with
       the "\x" form. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
956
/* Cached pointer to the Unicode name C API.  It stays NULL until
   PyCodec_NameReplaceErrors() successfully imports the capsule. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
958
/* "namereplace" handler: replace every character in [start, end) of a
   UnicodeEncodeError with "\N{NAME}" when the Unicode name database
   knows the character, and with "\xNN", "\uNNNN" or "\UNNNNNNNN"
   otherwise.  Returns (replacement, end), where `end` may be lowered so
   that the replacement size does not overflow Py_ssize_t.

   Returns NULL with an exception set on failure; any exception type
   other than UnicodeEncodeError yields TypeError. */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UCS1 *outp;
        Py_ssize_t ressize;
        int replsize;
        Py_UCS4 c;
        char buffer[256]; /* NAME_MAXLEN */
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        if (!ucnhash_capi) {
            /* load the unicode data module */
            ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                            PyUnicodeData_CAPSULE_NAME, 1);
            if (!ucnhash_capi) {
                /* `object` is an owned reference: do not leak it. */
                Py_DECREF(object);
                return NULL;
            }
        }

        /* First pass: compute the replacement size, stopping early if
           adding one more escape would overflow Py_ssize_t. */
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
                /* '\\' 'N' '{' name '}' */
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }
        /* Only the characters that were sized are replaced. */
        end = i;
        res = PyUnicode_New(ressize, 127);
        if (res==NULL) {
            /* `object` is an owned reference: do not leak it. */
            Py_DECREF(object);
            return NULL;
        }

        /* Second pass: write the escapes. */
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
                *outp++ = 'N';
                *outp++ = '{';
                /* strcpy() also writes a NUL, which the '}' below (or
                   the string's own terminator slot) overwrites. */
                strcpy((char *)outp, buffer);
                outp += strlen(buffer);
                *outp++ = '}';
                continue;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
        assert(_PyUnicode_CheckConsistency(res, 1));
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
1052
/* Encoding identifiers returned by get_standard_encoding().  ENC_UNKNOWN
   is returned for any name that function does not recognize. */
#define ENC_UNKNOWN -1
#define ENC_UTF8 0
#define ENC_UTF16BE 1
#define ENC_UTF16LE 2
#define ENC_UTF32BE 3
#define ENC_UTF32LE 4
1059
1060 static int
get_standard_encoding(const char * encoding,int * bytelength)1061 get_standard_encoding(const char *encoding, int *bytelength)
1062 {
1063 if (Py_TOLOWER(encoding[0]) == 'u' &&
1064 Py_TOLOWER(encoding[1]) == 't' &&
1065 Py_TOLOWER(encoding[2]) == 'f') {
1066 encoding += 3;
1067 if (*encoding == '-' || *encoding == '_' )
1068 encoding++;
1069 if (encoding[0] == '8' && encoding[1] == '\0') {
1070 *bytelength = 3;
1071 return ENC_UTF8;
1072 }
1073 else if (encoding[0] == '1' && encoding[1] == '6') {
1074 encoding += 2;
1075 *bytelength = 2;
1076 if (*encoding == '\0') {
1077 #ifdef WORDS_BIGENDIAN
1078 return ENC_UTF16BE;
1079 #else
1080 return ENC_UTF16LE;
1081 #endif
1082 }
1083 if (*encoding == '-' || *encoding == '_' )
1084 encoding++;
1085 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1086 if (Py_TOLOWER(encoding[0]) == 'b')
1087 return ENC_UTF16BE;
1088 if (Py_TOLOWER(encoding[0]) == 'l')
1089 return ENC_UTF16LE;
1090 }
1091 }
1092 else if (encoding[0] == '3' && encoding[1] == '2') {
1093 encoding += 2;
1094 *bytelength = 4;
1095 if (*encoding == '\0') {
1096 #ifdef WORDS_BIGENDIAN
1097 return ENC_UTF32BE;
1098 #else
1099 return ENC_UTF32LE;
1100 #endif
1101 }
1102 if (*encoding == '-' || *encoding == '_' )
1103 encoding++;
1104 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1105 if (Py_TOLOWER(encoding[0]) == 'b')
1106 return ENC_UTF32BE;
1107 if (Py_TOLOWER(encoding[0]) == 'l')
1108 return ENC_UTF32LE;
1109 }
1110 }
1111 }
1112 else if (strcmp(encoding, "CP_UTF8") == 0) {
1113 *bytelength = 3;
1114 return ENC_UTF8;
1115 }
1116 return ENC_UNKNOWN;
1117 }
1118
1119 /* This handler is declared static until someone demonstrates
1120 a need to call it directly. */
1121 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1122 PyCodec_SurrogatePassErrors(PyObject *exc)
1123 {
1124 PyObject *restuple;
1125 PyObject *object;
1126 PyObject *encode;
1127 const char *encoding;
1128 int code;
1129 int bytelength;
1130 Py_ssize_t i;
1131 Py_ssize_t start;
1132 Py_ssize_t end;
1133 PyObject *res;
1134
1135 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1136 unsigned char *outp;
1137 if (PyUnicodeEncodeError_GetStart(exc, &start))
1138 return NULL;
1139 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1140 return NULL;
1141 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1142 return NULL;
1143 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1144 Py_DECREF(object);
1145 return NULL;
1146 }
1147 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1148 Py_DECREF(object);
1149 Py_DECREF(encode);
1150 return NULL;
1151 }
1152 code = get_standard_encoding(encoding, &bytelength);
1153 Py_DECREF(encode);
1154 if (code == ENC_UNKNOWN) {
1155 /* Not supported, fail with original exception */
1156 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157 Py_DECREF(object);
1158 return NULL;
1159 }
1160
1161 if (end - start > PY_SSIZE_T_MAX / bytelength)
1162 end = start + PY_SSIZE_T_MAX / bytelength;
1163 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1164 if (!res) {
1165 Py_DECREF(object);
1166 return NULL;
1167 }
1168 outp = (unsigned char*)PyBytes_AsString(res);
1169 for (i = start; i < end; i++) {
1170 /* object is guaranteed to be "ready" */
1171 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1172 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1173 /* Not a surrogate, fail with original exception */
1174 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175 Py_DECREF(res);
1176 Py_DECREF(object);
1177 return NULL;
1178 }
1179 switch (code) {
1180 case ENC_UTF8:
1181 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184 break;
1185 case ENC_UTF16LE:
1186 *outp++ = (unsigned char) ch;
1187 *outp++ = (unsigned char)(ch >> 8);
1188 break;
1189 case ENC_UTF16BE:
1190 *outp++ = (unsigned char)(ch >> 8);
1191 *outp++ = (unsigned char) ch;
1192 break;
1193 case ENC_UTF32LE:
1194 *outp++ = (unsigned char) ch;
1195 *outp++ = (unsigned char)(ch >> 8);
1196 *outp++ = (unsigned char)(ch >> 16);
1197 *outp++ = (unsigned char)(ch >> 24);
1198 break;
1199 case ENC_UTF32BE:
1200 *outp++ = (unsigned char)(ch >> 24);
1201 *outp++ = (unsigned char)(ch >> 16);
1202 *outp++ = (unsigned char)(ch >> 8);
1203 *outp++ = (unsigned char) ch;
1204 break;
1205 }
1206 }
1207 restuple = Py_BuildValue("(On)", res, end);
1208 Py_DECREF(res);
1209 Py_DECREF(object);
1210 return restuple;
1211 }
1212 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1213 const unsigned char *p;
1214 Py_UCS4 ch = 0;
1215 if (PyUnicodeDecodeError_GetStart(exc, &start))
1216 return NULL;
1217 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218 return NULL;
1219 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220 return NULL;
1221 p = (const unsigned char*)PyBytes_AS_STRING(object);
1222 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1223 Py_DECREF(object);
1224 return NULL;
1225 }
1226 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1227 Py_DECREF(object);
1228 Py_DECREF(encode);
1229 return NULL;
1230 }
1231 code = get_standard_encoding(encoding, &bytelength);
1232 Py_DECREF(encode);
1233 if (code == ENC_UNKNOWN) {
1234 /* Not supported, fail with original exception */
1235 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236 Py_DECREF(object);
1237 return NULL;
1238 }
1239
1240 /* Try decoding a single surrogate character. If
1241 there are more, let the codec call us again. */
1242 p += start;
1243 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1244 switch (code) {
1245 case ENC_UTF8:
1246 if ((p[0] & 0xf0) == 0xe0 &&
1247 (p[1] & 0xc0) == 0x80 &&
1248 (p[2] & 0xc0) == 0x80) {
1249 /* it's a three-byte code */
1250 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251 }
1252 break;
1253 case ENC_UTF16LE:
1254 ch = p[1] << 8 | p[0];
1255 break;
1256 case ENC_UTF16BE:
1257 ch = p[0] << 8 | p[1];
1258 break;
1259 case ENC_UTF32LE:
1260 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261 break;
1262 case ENC_UTF32BE:
1263 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264 break;
1265 }
1266 }
1267
1268 Py_DECREF(object);
1269 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1270 /* it's not a surrogate - fail */
1271 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272 return NULL;
1273 }
1274 res = PyUnicode_FromOrdinal(ch);
1275 if (res == NULL)
1276 return NULL;
1277 return Py_BuildValue("(Nn)", res, start + bytelength);
1278 }
1279 else {
1280 wrong_exception_type(exc);
1281 return NULL;
1282 }
1283 }
1284
1285 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1286 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1287 {
1288 PyObject *restuple;
1289 PyObject *object;
1290 Py_ssize_t i;
1291 Py_ssize_t start;
1292 Py_ssize_t end;
1293 PyObject *res;
1294
1295 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1296 char *outp;
1297 if (PyUnicodeEncodeError_GetStart(exc, &start))
1298 return NULL;
1299 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1300 return NULL;
1301 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1302 return NULL;
1303 res = PyBytes_FromStringAndSize(NULL, end-start);
1304 if (!res) {
1305 Py_DECREF(object);
1306 return NULL;
1307 }
1308 outp = PyBytes_AsString(res);
1309 for (i = start; i < end; i++) {
1310 /* object is guaranteed to be "ready" */
1311 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1312 if (ch < 0xdc80 || ch > 0xdcff) {
1313 /* Not a UTF-8b surrogate, fail with original exception */
1314 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1315 Py_DECREF(res);
1316 Py_DECREF(object);
1317 return NULL;
1318 }
1319 *outp++ = ch - 0xdc00;
1320 }
1321 restuple = Py_BuildValue("(On)", res, end);
1322 Py_DECREF(res);
1323 Py_DECREF(object);
1324 return restuple;
1325 }
1326 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1327 PyObject *str;
1328 const unsigned char *p;
1329 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1330 int consumed = 0;
1331 if (PyUnicodeDecodeError_GetStart(exc, &start))
1332 return NULL;
1333 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1334 return NULL;
1335 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1336 return NULL;
1337 p = (const unsigned char*)PyBytes_AS_STRING(object);
1338 while (consumed < 4 && consumed < end-start) {
1339 /* Refuse to escape ASCII bytes. */
1340 if (p[start+consumed] < 128)
1341 break;
1342 ch[consumed] = 0xdc00 + p[start+consumed];
1343 consumed++;
1344 }
1345 Py_DECREF(object);
1346 if (!consumed) {
1347 /* codec complained about ASCII byte. */
1348 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1349 return NULL;
1350 }
1351 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1352 if (str == NULL)
1353 return NULL;
1354 return Py_BuildValue("(Nn)", str, start+consumed);
1355 }
1356 else {
1357 wrong_exception_type(exc);
1358 return NULL;
1359 }
1360 }
1361
1362
/* Method-table adapter: forwards `exc` to PyCodec_StrictErrors().
   `self` is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1367
1368
/* Method-table adapter: forwards `exc` to PyCodec_IgnoreErrors().
   `self` is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1373
1374
/* Method-table adapter: forwards `exc` to PyCodec_ReplaceErrors().
   `self` is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1379
1380
/* Method-table adapter: forwards `exc` to PyCodec_XMLCharRefReplaceErrors().
   `self` is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1385
1386
/* Method-table adapter: forwards `exc` to PyCodec_BackslashReplaceErrors().
   `self` is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1391
/* Method-table adapter: forwards `exc` to PyCodec_NameReplaceErrors().
   `self` is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1396
/* Method-table adapter: forwards `exc` to PyCodec_SurrogatePassErrors().
   `self` is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1401
/* Method-table adapter: forwards `exc` to PyCodec_SurrogateEscapeErrors().
   `self` is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1406
_PyCodecRegistry_Init(void)1407 static int _PyCodecRegistry_Init(void)
1408 {
1409 static struct {
1410 const char *name;
1411 PyMethodDef def;
1412 } methods[] =
1413 {
1414 {
1415 "strict",
1416 {
1417 "strict_errors",
1418 strict_errors,
1419 METH_O,
1420 PyDoc_STR("Implements the 'strict' error handling, which "
1421 "raises a UnicodeError on coding errors.")
1422 }
1423 },
1424 {
1425 "ignore",
1426 {
1427 "ignore_errors",
1428 ignore_errors,
1429 METH_O,
1430 PyDoc_STR("Implements the 'ignore' error handling, which "
1431 "ignores malformed data and continues.")
1432 }
1433 },
1434 {
1435 "replace",
1436 {
1437 "replace_errors",
1438 replace_errors,
1439 METH_O,
1440 PyDoc_STR("Implements the 'replace' error handling, which "
1441 "replaces malformed data with a replacement marker.")
1442 }
1443 },
1444 {
1445 "xmlcharrefreplace",
1446 {
1447 "xmlcharrefreplace_errors",
1448 xmlcharrefreplace_errors,
1449 METH_O,
1450 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451 "which replaces an unencodable character with the "
1452 "appropriate XML character reference.")
1453 }
1454 },
1455 {
1456 "backslashreplace",
1457 {
1458 "backslashreplace_errors",
1459 backslashreplace_errors,
1460 METH_O,
1461 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1462 "which replaces malformed data with a backslashed "
1463 "escape sequence.")
1464 }
1465 },
1466 {
1467 "namereplace",
1468 {
1469 "namereplace_errors",
1470 namereplace_errors,
1471 METH_O,
1472 PyDoc_STR("Implements the 'namereplace' error handling, "
1473 "which replaces an unencodable character with a "
1474 "\\N{...} escape sequence.")
1475 }
1476 },
1477 {
1478 "surrogatepass",
1479 {
1480 "surrogatepass",
1481 surrogatepass_errors,
1482 METH_O
1483 }
1484 },
1485 {
1486 "surrogateescape",
1487 {
1488 "surrogateescape",
1489 surrogateescape_errors,
1490 METH_O
1491 }
1492 }
1493 };
1494
1495 PyInterpreterState *interp = _PyInterpreterState_GET();
1496 PyObject *mod;
1497
1498 if (interp->codec_search_path != NULL)
1499 return 0;
1500
1501 interp->codec_search_path = PyList_New(0);
1502 if (interp->codec_search_path == NULL) {
1503 return -1;
1504 }
1505
1506 interp->codec_search_cache = PyDict_New();
1507 if (interp->codec_search_cache == NULL) {
1508 return -1;
1509 }
1510
1511 interp->codec_error_registry = PyDict_New();
1512 if (interp->codec_error_registry == NULL) {
1513 return -1;
1514 }
1515
1516 for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1517 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1518 if (!func) {
1519 return -1;
1520 }
1521
1522 int res = PyCodec_RegisterError(methods[i].name, func);
1523 Py_DECREF(func);
1524 if (res) {
1525 return -1;
1526 }
1527 }
1528
1529 mod = PyImport_ImportModule("encodings");
1530 if (mod == NULL) {
1531 return -1;
1532 }
1533 Py_DECREF(mod);
1534 interp->codecs_initialized = 1;
1535 return 0;
1536 }
1537