1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg ([email protected]).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include "pycore_call.h"          // _PyObject_CallNoArgs()
13 #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
14 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
15 #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
16 #include <ctype.h>
17 
/* Lowercase hexadecimal digit characters; callers in this file index it
   with a 4-bit value (0-15) to format escape sequences. */
const char *Py_hexdigits = "0123456789abcdef";
19 
20 /* --- Codec Registry ----------------------------------------------------- */
21 
22 /* Import the standard encodings package which will register the first
23    codec search function.
24 
25    This is done in a lazy way so that the Unicode implementation does
26    not downgrade startup time of scripts not needing it.
27 
28    ImportErrors are silently ignored by this function. Only one try is
29    made.
30 
31 */
32 
33 static int _PyCodecRegistry_Init(void); /* Forward */
34 
/* Append search_function to the current interpreter's codec search path.

   The codec registry is initialized first when the search path does not
   exist yet.  Returns -1 with an exception set when initialization fails,
   when search_function is NULL, or when it is not callable; otherwise
   returns the result of PyList_Append(). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();

    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
        return -1;
    }
    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }
    return PyList_Append(interp->codec_search_path, search_function);
}
53 
54 int
PyCodec_Unregister(PyObject * search_function)55 PyCodec_Unregister(PyObject *search_function)
56 {
57     PyInterpreterState *interp = PyInterpreterState_Get();
58     PyObject *codec_search_path = interp->codec_search_path;
59     /* Do nothing if codec_search_path is not created yet or was cleared. */
60     if (codec_search_path == NULL) {
61         return 0;
62     }
63 
64     assert(PyList_CheckExact(codec_search_path));
65     Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
66     for (Py_ssize_t i = 0; i < n; i++) {
67         PyObject *item = PyList_GET_ITEM(codec_search_path, i);
68         if (item == search_function) {
69             if (interp->codec_search_cache != NULL) {
70                 assert(PyDict_CheckExact(interp->codec_search_cache));
71                 PyDict_Clear(interp->codec_search_cache);
72             }
73             return PyList_SetSlice(codec_search_path, i, i+1, NULL);
74         }
75     }
76     return 0;
77 }
78 
79 extern int _Py_normalize_encoding(const char *, char *, size_t);
80 
81 /* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
82    converted to lower case, spaces and hyphens are replaced with underscores. */
83 
84 static
normalizestring(const char * string)85 PyObject *normalizestring(const char *string)
86 {
87     size_t len = strlen(string);
88     char *encoding;
89     PyObject *v;
90 
91     if (len > PY_SSIZE_T_MAX) {
92         PyErr_SetString(PyExc_OverflowError, "string is too large");
93         return NULL;
94     }
95 
96     encoding = PyMem_Malloc(len + 1);
97     if (encoding == NULL)
98         return PyErr_NoMemory();
99 
100     if (!_Py_normalize_encoding(string, encoding, len + 1))
101     {
102         PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
103         PyMem_Free(encoding);
104         return NULL;
105     }
106 
107     v = PyUnicode_FromString(encoding);
108     PyMem_Free(encoding);
109     return v;
110 }
111 
112 /* Lookup the given encoding and return a tuple providing the codec
113    facilities.
114 
115    The encoding string is looked up converted to all lower-case
116    characters. This makes encodings looked up through this mechanism
117    effectively case-insensitive.
118 
119    If no codec is found, a LookupError is set and NULL returned.
120 
121    As side effect, this tries to load the encodings package, if not
122    yet done. This is part of the lazy load strategy for the encodings
123    package.
124 
125 */
126 
_PyCodec_Lookup(const char * encoding)127 PyObject *_PyCodec_Lookup(const char *encoding)
128 {
129     if (encoding == NULL) {
130         PyErr_BadArgument();
131         return NULL;
132     }
133 
134     PyInterpreterState *interp = _PyInterpreterState_GET();
135     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
136         return NULL;
137     }
138 
139     /* Convert the encoding to a normalized Python string: all
140        characters are converted to lower case, spaces and hyphens are
141        replaced with underscores. */
142     PyObject *v = normalizestring(encoding);
143     if (v == NULL) {
144         return NULL;
145     }
146     PyUnicode_InternInPlace(&v);
147 
148     /* First, try to lookup the name in the registry dictionary */
149     PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
150     if (result != NULL) {
151         Py_INCREF(result);
152         Py_DECREF(v);
153         return result;
154     }
155     else if (PyErr_Occurred()) {
156         goto onError;
157     }
158 
159     /* Next, scan the search functions in order of registration */
160     const Py_ssize_t len = PyList_Size(interp->codec_search_path);
161     if (len < 0)
162         goto onError;
163     if (len == 0) {
164         PyErr_SetString(PyExc_LookupError,
165                         "no codec search functions registered: "
166                         "can't find encoding");
167         goto onError;
168     }
169 
170     Py_ssize_t i;
171     for (i = 0; i < len; i++) {
172         PyObject *func;
173 
174         func = PyList_GetItem(interp->codec_search_path, i);
175         if (func == NULL)
176             goto onError;
177         result = PyObject_CallOneArg(func, v);
178         if (result == NULL)
179             goto onError;
180         if (result == Py_None) {
181             Py_DECREF(result);
182             continue;
183         }
184         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
185             PyErr_SetString(PyExc_TypeError,
186                             "codec search functions must return 4-tuples");
187             Py_DECREF(result);
188             goto onError;
189         }
190         break;
191     }
192     if (i == len) {
193         /* XXX Perhaps we should cache misses too ? */
194         PyErr_Format(PyExc_LookupError,
195                      "unknown encoding: %s", encoding);
196         goto onError;
197     }
198 
199     /* Cache and return the result */
200     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
201         Py_DECREF(result);
202         goto onError;
203     }
204     Py_DECREF(v);
205     return result;
206 
207  onError:
208     Py_DECREF(v);
209     return NULL;
210 }
211 
212 /* Codec registry encoding check API. */
213 
PyCodec_KnownEncoding(const char * encoding)214 int PyCodec_KnownEncoding(const char *encoding)
215 {
216     PyObject *codecs;
217 
218     codecs = _PyCodec_Lookup(encoding);
219     if (!codecs) {
220         PyErr_Clear();
221         return 0;
222     }
223     else {
224         Py_DECREF(codecs);
225         return 1;
226     }
227 }
228 
229 static
args_tuple(PyObject * object,const char * errors)230 PyObject *args_tuple(PyObject *object,
231                      const char *errors)
232 {
233     PyObject *args;
234 
235     args = PyTuple_New(1 + (errors != NULL));
236     if (args == NULL)
237         return NULL;
238     Py_INCREF(object);
239     PyTuple_SET_ITEM(args,0,object);
240     if (errors) {
241         PyObject *v;
242 
243         v = PyUnicode_FromString(errors);
244         if (v == NULL) {
245             Py_DECREF(args);
246             return NULL;
247         }
248         PyTuple_SET_ITEM(args, 1, v);
249     }
250     return args;
251 }
252 
253 /* Helper function to get a codec item */
254 
255 static
codec_getitem(const char * encoding,int index)256 PyObject *codec_getitem(const char *encoding, int index)
257 {
258     PyObject *codecs;
259     PyObject *v;
260 
261     codecs = _PyCodec_Lookup(encoding);
262     if (codecs == NULL)
263         return NULL;
264     v = PyTuple_GET_ITEM(codecs, index);
265     Py_DECREF(codecs);
266     Py_INCREF(v);
267     return v;
268 }
269 
270 /* Helper functions to create an incremental codec. */
271 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)272 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
273                                      const char *errors,
274                                      const char *attrname)
275 {
276     PyObject *ret, *inccodec;
277 
278     inccodec = PyObject_GetAttrString(codec_info, attrname);
279     if (inccodec == NULL)
280         return NULL;
281     if (errors)
282         ret = PyObject_CallFunction(inccodec, "s", errors);
283     else
284         ret = _PyObject_CallNoArgs(inccodec);
285     Py_DECREF(inccodec);
286     return ret;
287 }
288 
289 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)290 PyObject *codec_getincrementalcodec(const char *encoding,
291                                     const char *errors,
292                                     const char *attrname)
293 {
294     PyObject *codec_info, *ret;
295 
296     codec_info = _PyCodec_Lookup(encoding);
297     if (codec_info == NULL)
298         return NULL;
299     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
300     Py_DECREF(codec_info);
301     return ret;
302 }
303 
304 /* Helper function to create a stream codec. */
305 
306 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)307 PyObject *codec_getstreamcodec(const char *encoding,
308                                PyObject *stream,
309                                const char *errors,
310                                const int index)
311 {
312     PyObject *codecs, *streamcodec, *codeccls;
313 
314     codecs = _PyCodec_Lookup(encoding);
315     if (codecs == NULL)
316         return NULL;
317 
318     codeccls = PyTuple_GET_ITEM(codecs, index);
319     if (errors != NULL)
320         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
321     else
322         streamcodec = PyObject_CallOneArg(codeccls, stream);
323     Py_DECREF(codecs);
324     return streamcodec;
325 }
326 
327 /* Helpers to work with the result of _PyCodec_Lookup
328 
329  */
/* Create an incremental decoder by calling codec_info's
   "incrementaldecoder" attribute; errors is forwarded when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
336 
/* Create an incremental encoder by calling codec_info's
   "incrementalencoder" attribute; errors is forwarded when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    static const char factory_attr[] = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
343 
344 
345 /* Convenience APIs to query the Codec registry.
346 
347    All APIs return a codec object with incremented refcount.
348 
349  */
350 
PyCodec_Encoder(const char * encoding)351 PyObject *PyCodec_Encoder(const char *encoding)
352 {
353     return codec_getitem(encoding, 0);
354 }
355 
PyCodec_Decoder(const char * encoding)356 PyObject *PyCodec_Decoder(const char *encoding)
357 {
358     return codec_getitem(encoding, 1);
359 }
360 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)361 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
362                                      const char *errors)
363 {
364     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
365 }
366 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)367 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
368                                      const char *errors)
369 {
370     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
371 }
372 
/* Create a stream codec for stream using entry 2 of the codec tuple
   looked up for encoding; errors is forwarded when non-NULL. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    enum { STREAM_READER_INDEX = 2 };

    return codec_getstreamcodec(encoding, stream, errors,
                                STREAM_READER_INDEX);
}
379 
/* Create a stream codec for stream using entry 3 of the codec tuple
   looked up for encoding; errors is forwarded when non-NULL. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    enum { STREAM_WRITER_INDEX = 3 };

    return codec_getstreamcodec(encoding, stream, errors,
                                STREAM_WRITER_INDEX);
}
386 
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of
 * the exception raised.
 *
 * operation and encoding are substituted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause() swaps the active exception for a suitably
     * updated clone when it can; otherwise the original exception is left
     * untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
402 
403 /* Encode an object (e.g. a Unicode object) using the given encoding
404    and return the resulting encoded object (usually a Python string).
405 
406    errors is passed to the encoder factory as argument if non-NULL. */
407 
408 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)409 _PyCodec_EncodeInternal(PyObject *object,
410                         PyObject *encoder,
411                         const char *encoding,
412                         const char *errors)
413 {
414     PyObject *args = NULL, *result = NULL;
415     PyObject *v = NULL;
416 
417     args = args_tuple(object, errors);
418     if (args == NULL)
419         goto onError;
420 
421     result = PyObject_Call(encoder, args, NULL);
422     if (result == NULL) {
423         wrap_codec_error("encoding", encoding);
424         goto onError;
425     }
426 
427     if (!PyTuple_Check(result) ||
428         PyTuple_GET_SIZE(result) != 2) {
429         PyErr_SetString(PyExc_TypeError,
430                         "encoder must return a tuple (object, integer)");
431         goto onError;
432     }
433     v = PyTuple_GET_ITEM(result,0);
434     Py_INCREF(v);
435     /* We don't check or use the second (integer) entry. */
436 
437     Py_DECREF(args);
438     Py_DECREF(encoder);
439     Py_DECREF(result);
440     return v;
441 
442  onError:
443     Py_XDECREF(result);
444     Py_XDECREF(args);
445     Py_XDECREF(encoder);
446     return NULL;
447 }
448 
449 /* Decode an object (usually a Python string) using the given encoding
450    and return an equivalent object (e.g. a Unicode object).
451 
452    errors is passed to the decoder factory as argument if non-NULL. */
453 
454 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)455 _PyCodec_DecodeInternal(PyObject *object,
456                         PyObject *decoder,
457                         const char *encoding,
458                         const char *errors)
459 {
460     PyObject *args = NULL, *result = NULL;
461     PyObject *v;
462 
463     args = args_tuple(object, errors);
464     if (args == NULL)
465         goto onError;
466 
467     result = PyObject_Call(decoder, args, NULL);
468     if (result == NULL) {
469         wrap_codec_error("decoding", encoding);
470         goto onError;
471     }
472     if (!PyTuple_Check(result) ||
473         PyTuple_GET_SIZE(result) != 2) {
474         PyErr_SetString(PyExc_TypeError,
475                         "decoder must return a tuple (object,integer)");
476         goto onError;
477     }
478     v = PyTuple_GET_ITEM(result,0);
479     Py_INCREF(v);
480     /* We don't check or use the second (integer) entry. */
481 
482     Py_DECREF(args);
483     Py_DECREF(decoder);
484     Py_DECREF(result);
485     return v;
486 
487  onError:
488     Py_XDECREF(args);
489     Py_XDECREF(decoder);
490     Py_XDECREF(result);
491     return NULL;
492 }
493 
494 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding.  errors is
   forwarded to the encoder when non-NULL.  Returns a new reference, or
   NULL with an exception set. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    /* _PyCodec_EncodeInternal() consumes the encoder reference. */
    return (encoder != NULL)
        ? _PyCodec_EncodeInternal(object, encoder, encoding, errors)
        : NULL;
}
507 
/* Decode object with the decoder registered for encoding.  errors is
   forwarded to the decoder when non-NULL.  Returns a new reference, or
   NULL with an exception set. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    /* _PyCodec_DecodeInternal() consumes the decoder reference. */
    return (decoder != NULL)
        ? _PyCodec_DecodeInternal(object, decoder, encoding, errors)
        : NULL;
}
520 
521 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)522 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
523                                        const char *alternate_command)
524 {
525     PyObject *codec;
526     PyObject *attr;
527     int is_text_codec;
528 
529     codec = _PyCodec_Lookup(encoding);
530     if (codec == NULL)
531         return NULL;
532 
533     /* Backwards compatibility: assume any raw tuple describes a text
534      * encoding, and the same for anything lacking the private
535      * attribute.
536      */
537     if (!PyTuple_CheckExact(codec)) {
538         if (_PyObject_LookupAttr(codec, &_Py_ID(_is_text_encoding), &attr) < 0) {
539             Py_DECREF(codec);
540             return NULL;
541         }
542         if (attr != NULL) {
543             is_text_codec = PyObject_IsTrue(attr);
544             Py_DECREF(attr);
545             if (is_text_codec <= 0) {
546                 Py_DECREF(codec);
547                 if (!is_text_codec)
548                     PyErr_Format(PyExc_LookupError,
549                                  "'%.400s' is not a text encoding; "
550                                  "use %s to handle arbitrary codecs",
551                                  encoding, alternate_command);
552                 return NULL;
553             }
554         }
555     }
556 
557     /* This appears to be a valid text encoding */
558     return codec;
559 }
560 
561 
562 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)563 PyObject *codec_getitem_checked(const char *encoding,
564                                 const char *alternate_command,
565                                 int index)
566 {
567     PyObject *codec;
568     PyObject *v;
569 
570     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571     if (codec == NULL)
572         return NULL;
573 
574     v = PyTuple_GET_ITEM(codec, index);
575     Py_INCREF(v);
576     Py_DECREF(codec);
577     return v;
578 }
579 
_PyCodec_TextEncoder(const char * encoding)580 static PyObject * _PyCodec_TextEncoder(const char *encoding)
581 {
582     return codec_getitem_checked(encoding, "codecs.encode()", 0);
583 }
584 
_PyCodec_TextDecoder(const char * encoding)585 static PyObject * _PyCodec_TextDecoder(const char *encoding)
586 {
587     return codec_getitem_checked(encoding, "codecs.decode()", 1);
588 }
589 
/* Encode object with the text encoder for encoding.  errors is forwarded
   to the encoder when non-NULL.  Returns a new reference, or NULL with an
   exception set. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    /* _PyCodec_EncodeInternal() consumes the encoder reference. */
    return (encoder != NULL)
        ? _PyCodec_EncodeInternal(object, encoder, encoding, errors)
        : NULL;
}
602 
/* Decode object with the text decoder for encoding.  errors is forwarded
   to the decoder when non-NULL.  Returns a new reference, or NULL with an
   exception set. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    /* _PyCodec_DecodeInternal() consumes the decoder reference. */
    return (decoder != NULL)
        ? _PyCodec_DecodeInternal(object, decoder, encoding, errors)
        : NULL;
}
615 
616 /* Register the error handling callback function error under the name
617    name. This function will be called by the codec when it encounters
618    an unencodable characters/undecodable bytes and doesn't know the
619    callback name, when name is specified as the error parameter
620    in the call to the encode/decode function.
621    Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)622 int PyCodec_RegisterError(const char *name, PyObject *error)
623 {
624     PyInterpreterState *interp = _PyInterpreterState_GET();
625     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
626         return -1;
627     if (!PyCallable_Check(error)) {
628         PyErr_SetString(PyExc_TypeError, "handler must be callable");
629         return -1;
630     }
631     return PyDict_SetItemString(interp->codec_error_registry,
632                                 name, error);
633 }
634 
635 /* Lookup the error handling callback function registered under the
636    name error. As a special case NULL can be passed, in which case
637    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)638 PyObject *PyCodec_LookupError(const char *name)
639 {
640     PyObject *handler = NULL;
641 
642     PyInterpreterState *interp = _PyInterpreterState_GET();
643     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
644         return NULL;
645 
646     if (name==NULL)
647         name = "strict";
648     handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649     if (handler) {
650         Py_INCREF(handler);
651     }
652     else if (!PyErr_Occurred()) {
653         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654     }
655     return handler;
656 }
657 
/* Set a TypeError saying that an error callback cannot handle objects of
   exc's type. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
664 
/* Error callback that re-raises exc.  A TypeError is raised instead when
   exc is not an exception instance.  Always returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
673 
674 
/* Error callback that drops the offending input: returns the tuple
   (empty str, end), where end is the end position stored in exc.
   Accepts UnicodeEncodeError, UnicodeDecodeError and
   UnicodeTranslateError instances; any other object raises TypeError. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
697 
698 
/* Error callback that substitutes a replacement marker for bad input.

   - UnicodeEncodeError: one '?' per character in [start, end).
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER.
   - UnicodeTranslateError: one Py_UNICODE_REPLACEMENT_CHARACTER per
     character in [start, end).

   Returns the tuple (replacement, end), or NULL with an exception set;
   objects of any other type raise TypeError. */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start) ||
            PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *replacement = PyUnicode_New(count, '?');
        if (replacement == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(replacement) == PyUnicode_1BYTE_KIND);
        Py_UCS1 *out = PyUnicode_1BYTE_DATA(replacement);
        for (Py_ssize_t k = 0; k < count; k++) {
            out[k] = '?';
        }
        assert(_PyUnicode_CheckConsistency(replacement, 1));
        return Py_BuildValue("(Nn)", replacement, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start) ||
            PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }
        const Py_ssize_t count = end - start;
        PyObject *replacement = PyUnicode_New(count,
                                              Py_UNICODE_REPLACEMENT_CHARACTER);
        if (replacement == NULL) {
            return NULL;
        }
        assert(PyUnicode_KIND(replacement) == PyUnicode_2BYTE_KIND);
        Py_UCS2 *out = PyUnicode_2BYTE_DATA(replacement);
        for (Py_ssize_t k = 0; k < count; k++) {
            out[k] = Py_UNICODE_REPLACEMENT_CHARACTER;
        }
        assert(_PyUnicode_CheckConsistency(replacement, 1));
        return Py_BuildValue("(Nn)", replacement, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
751 
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)752 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
753 {
754     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
755         PyObject *restuple;
756         PyObject *object;
757         Py_ssize_t i;
758         Py_ssize_t start;
759         Py_ssize_t end;
760         PyObject *res;
761         Py_UCS1 *outp;
762         Py_ssize_t ressize;
763         Py_UCS4 ch;
764         if (PyUnicodeEncodeError_GetStart(exc, &start))
765             return NULL;
766         if (PyUnicodeEncodeError_GetEnd(exc, &end))
767             return NULL;
768         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
769             return NULL;
770         if (end - start > PY_SSIZE_T_MAX / (2+7+1))
771             end = start + PY_SSIZE_T_MAX / (2+7+1);
772         for (i = start, ressize = 0; i < end; ++i) {
773             /* object is guaranteed to be "ready" */
774             ch = PyUnicode_READ_CHAR(object, i);
775             if (ch<10)
776                 ressize += 2+1+1;
777             else if (ch<100)
778                 ressize += 2+2+1;
779             else if (ch<1000)
780                 ressize += 2+3+1;
781             else if (ch<10000)
782                 ressize += 2+4+1;
783             else if (ch<100000)
784                 ressize += 2+5+1;
785             else if (ch<1000000)
786                 ressize += 2+6+1;
787             else
788                 ressize += 2+7+1;
789         }
790         /* allocate replacement */
791         res = PyUnicode_New(ressize, 127);
792         if (res == NULL) {
793             Py_DECREF(object);
794             return NULL;
795         }
796         outp = PyUnicode_1BYTE_DATA(res);
797         /* generate replacement */
798         for (i = start; i < end; ++i) {
799             int digits;
800             int base;
801             ch = PyUnicode_READ_CHAR(object, i);
802             *outp++ = '&';
803             *outp++ = '#';
804             if (ch<10) {
805                 digits = 1;
806                 base = 1;
807             }
808             else if (ch<100) {
809                 digits = 2;
810                 base = 10;
811             }
812             else if (ch<1000) {
813                 digits = 3;
814                 base = 100;
815             }
816             else if (ch<10000) {
817                 digits = 4;
818                 base = 1000;
819             }
820             else if (ch<100000) {
821                 digits = 5;
822                 base = 10000;
823             }
824             else if (ch<1000000) {
825                 digits = 6;
826                 base = 100000;
827             }
828             else {
829                 digits = 7;
830                 base = 1000000;
831             }
832             while (digits-->0) {
833                 *outp++ = '0' + ch/base;
834                 ch %= base;
835                 base /= 10;
836             }
837             *outp++ = ';';
838         }
839         assert(_PyUnicode_CheckConsistency(res, 1));
840         restuple = Py_BuildValue("(Nn)", res, end);
841         Py_DECREF(object);
842         return restuple;
843     }
844     else {
845         wrong_exception_type(exc);
846         return NULL;
847     }
848 }
849 
/* Error callback that replaces the offending range with backslash escapes.

   - UnicodeDecodeError: every byte in [start, end) becomes "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) becomes "\xNN", "\uNNNN" or "\UNNNNNNNN", depending on
     its code point.

   Returns the tuple (replacement, end), or NULL with an exception set;
   objects of any other type raise TypeError. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    Py_UCS1 *outp;
    /* Must be Py_ssize_t: the total can reach PY_SSIZE_T_MAX / 10 * 10
       (see the clamp below), which does not fit in an int on 64-bit
       platforms. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Every byte expands to exactly four characters: \xNN */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Each character needs at most 1+1+8 output characters; shorten the
       range so the total size cannot exceed PY_SSIZE_T_MAX. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    /* First pass: compute the size of the replacement string. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    /* Second pass: write the escapes.  The two lowest hex digits are
       common to all three forms and are emitted after the branches. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
956 
/* Cached pointer to the unicodedata name C API.  It stays NULL until
   PyCodec_NameReplaceErrors() first needs it and imports the capsule. */
static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
958 
/* Implements the "namereplace" error handler.

   exc must be a UnicodeEncodeError instance.  Every character in
   [start, end) of the offending string is rendered as "\N{NAME}" when
   ucnhash_capi->getname() yields a name for it, and otherwise as "\xNN",
   "\uNNNN" or "\UNNNNNNNN" depending on its value.

   Returns a new (replacement_str, end) tuple, or NULL with an exception set.
   Any other exception type is reported through wrong_exception_type(). */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        Py_UCS1 *outp;
        Py_ssize_t ressize;
        int replsize;
        Py_UCS4 c;
        char buffer[256]; /* NAME_MAXLEN */
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        if (!ucnhash_capi) {
            /* load the unicode data module */
            ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                            PyUnicodeData_CAPSULE_NAME, 1);
            if (!ucnhash_capi) {
                /* object is an owned reference; release it on failure. */
                Py_DECREF(object);
                return NULL;
            }
        }

        /* First pass: compute the size of the replacement string.  Stop
           early (and shorten the handled range) if the next escape would
           push the total past PY_SSIZE_T_MAX. */
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
                /* backslash + 'N' + '{' + name + '}' */
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }
        end = i;
        res = PyUnicode_New(ressize, 127);
        if (res == NULL) {
            /* object is an owned reference; release it on failure. */
            Py_DECREF(object);
            return NULL;
        }

        /* Second pass: write the escapes into the preallocated buffer. */
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
                size_t namelen = strlen(buffer);
                *outp++ = 'N';
                *outp++ = '{';
                memcpy(outp, buffer, namelen);
                outp += namelen;
                *outp++ = '}';
                continue;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            /* The two low hex digits are common to \x, \u and \U. */
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
        assert(_PyUnicode_CheckConsistency(res, 1));
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
1052 
/* Identifiers for the encodings recognised by get_standard_encoding();
   ENC_UNKNOWN is what it returns for every other encoding name. */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4
1059 
1060 static int
get_standard_encoding(const char * encoding,int * bytelength)1061 get_standard_encoding(const char *encoding, int *bytelength)
1062 {
1063     if (Py_TOLOWER(encoding[0]) == 'u' &&
1064         Py_TOLOWER(encoding[1]) == 't' &&
1065         Py_TOLOWER(encoding[2]) == 'f') {
1066         encoding += 3;
1067         if (*encoding == '-' || *encoding == '_' )
1068             encoding++;
1069         if (encoding[0] == '8' && encoding[1] == '\0') {
1070             *bytelength = 3;
1071             return ENC_UTF8;
1072         }
1073         else if (encoding[0] == '1' && encoding[1] == '6') {
1074             encoding += 2;
1075             *bytelength = 2;
1076             if (*encoding == '\0') {
1077 #ifdef WORDS_BIGENDIAN
1078                 return ENC_UTF16BE;
1079 #else
1080                 return ENC_UTF16LE;
1081 #endif
1082             }
1083             if (*encoding == '-' || *encoding == '_' )
1084                 encoding++;
1085             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1086                 if (Py_TOLOWER(encoding[0]) == 'b')
1087                     return ENC_UTF16BE;
1088                 if (Py_TOLOWER(encoding[0]) == 'l')
1089                     return ENC_UTF16LE;
1090             }
1091         }
1092         else if (encoding[0] == '3' && encoding[1] == '2') {
1093             encoding += 2;
1094             *bytelength = 4;
1095             if (*encoding == '\0') {
1096 #ifdef WORDS_BIGENDIAN
1097                 return ENC_UTF32BE;
1098 #else
1099                 return ENC_UTF32LE;
1100 #endif
1101             }
1102             if (*encoding == '-' || *encoding == '_' )
1103                 encoding++;
1104             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1105                 if (Py_TOLOWER(encoding[0]) == 'b')
1106                     return ENC_UTF32BE;
1107                 if (Py_TOLOWER(encoding[0]) == 'l')
1108                     return ENC_UTF32LE;
1109             }
1110         }
1111     }
1112     else if (strcmp(encoding, "CP_UTF8") == 0) {
1113         *bytelength = 3;
1114         return ENC_UTF8;
1115     }
1116     return ENC_UNKNOWN;
1117 }
1118 
1119 /* This handler is declared static until someone demonstrates
1120    a need to call it directly. */
1121 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1122 PyCodec_SurrogatePassErrors(PyObject *exc)
1123 {
1124     PyObject *restuple;
1125     PyObject *object;
1126     PyObject *encode;
1127     const char *encoding;
1128     int code;
1129     int bytelength;
1130     Py_ssize_t i;
1131     Py_ssize_t start;
1132     Py_ssize_t end;
1133     PyObject *res;
1134 
1135     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1136         unsigned char *outp;
1137         if (PyUnicodeEncodeError_GetStart(exc, &start))
1138             return NULL;
1139         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1140             return NULL;
1141         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1142             return NULL;
1143         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1144             Py_DECREF(object);
1145             return NULL;
1146         }
1147         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1148             Py_DECREF(object);
1149             Py_DECREF(encode);
1150             return NULL;
1151         }
1152         code = get_standard_encoding(encoding, &bytelength);
1153         Py_DECREF(encode);
1154         if (code == ENC_UNKNOWN) {
1155             /* Not supported, fail with original exception */
1156             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157             Py_DECREF(object);
1158             return NULL;
1159         }
1160 
1161         if (end - start > PY_SSIZE_T_MAX / bytelength)
1162             end = start + PY_SSIZE_T_MAX / bytelength;
1163         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1164         if (!res) {
1165             Py_DECREF(object);
1166             return NULL;
1167         }
1168         outp = (unsigned char*)PyBytes_AsString(res);
1169         for (i = start; i < end; i++) {
1170             /* object is guaranteed to be "ready" */
1171             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1172             if (!Py_UNICODE_IS_SURROGATE(ch)) {
1173                 /* Not a surrogate, fail with original exception */
1174                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175                 Py_DECREF(res);
1176                 Py_DECREF(object);
1177                 return NULL;
1178             }
1179             switch (code) {
1180             case ENC_UTF8:
1181                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184                 break;
1185             case ENC_UTF16LE:
1186                 *outp++ = (unsigned char) ch;
1187                 *outp++ = (unsigned char)(ch >> 8);
1188                 break;
1189             case ENC_UTF16BE:
1190                 *outp++ = (unsigned char)(ch >> 8);
1191                 *outp++ = (unsigned char) ch;
1192                 break;
1193             case ENC_UTF32LE:
1194                 *outp++ = (unsigned char) ch;
1195                 *outp++ = (unsigned char)(ch >> 8);
1196                 *outp++ = (unsigned char)(ch >> 16);
1197                 *outp++ = (unsigned char)(ch >> 24);
1198                 break;
1199             case ENC_UTF32BE:
1200                 *outp++ = (unsigned char)(ch >> 24);
1201                 *outp++ = (unsigned char)(ch >> 16);
1202                 *outp++ = (unsigned char)(ch >> 8);
1203                 *outp++ = (unsigned char) ch;
1204                 break;
1205             }
1206         }
1207         restuple = Py_BuildValue("(On)", res, end);
1208         Py_DECREF(res);
1209         Py_DECREF(object);
1210         return restuple;
1211     }
1212     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1213         const unsigned char *p;
1214         Py_UCS4 ch = 0;
1215         if (PyUnicodeDecodeError_GetStart(exc, &start))
1216             return NULL;
1217         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218             return NULL;
1219         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220             return NULL;
1221         p = (const unsigned char*)PyBytes_AS_STRING(object);
1222         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1223             Py_DECREF(object);
1224             return NULL;
1225         }
1226         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1227             Py_DECREF(object);
1228             Py_DECREF(encode);
1229             return NULL;
1230         }
1231         code = get_standard_encoding(encoding, &bytelength);
1232         Py_DECREF(encode);
1233         if (code == ENC_UNKNOWN) {
1234             /* Not supported, fail with original exception */
1235             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236             Py_DECREF(object);
1237             return NULL;
1238         }
1239 
1240         /* Try decoding a single surrogate character. If
1241            there are more, let the codec call us again. */
1242         p += start;
1243         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1244             switch (code) {
1245             case ENC_UTF8:
1246                 if ((p[0] & 0xf0) == 0xe0 &&
1247                     (p[1] & 0xc0) == 0x80 &&
1248                     (p[2] & 0xc0) == 0x80) {
1249                     /* it's a three-byte code */
1250                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251                 }
1252                 break;
1253             case ENC_UTF16LE:
1254                 ch = p[1] << 8 | p[0];
1255                 break;
1256             case ENC_UTF16BE:
1257                 ch = p[0] << 8 | p[1];
1258                 break;
1259             case ENC_UTF32LE:
1260                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261                 break;
1262             case ENC_UTF32BE:
1263                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264                 break;
1265             }
1266         }
1267 
1268         Py_DECREF(object);
1269         if (!Py_UNICODE_IS_SURROGATE(ch)) {
1270             /* it's not a surrogate - fail */
1271             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272             return NULL;
1273         }
1274         res = PyUnicode_FromOrdinal(ch);
1275         if (res == NULL)
1276             return NULL;
1277         return Py_BuildValue("(Nn)", res, start + bytelength);
1278     }
1279     else {
1280         wrong_exception_type(exc);
1281         return NULL;
1282     }
1283 }
1284 
1285 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1286 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1287 {
1288     PyObject *restuple;
1289     PyObject *object;
1290     Py_ssize_t i;
1291     Py_ssize_t start;
1292     Py_ssize_t end;
1293     PyObject *res;
1294 
1295     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1296         char *outp;
1297         if (PyUnicodeEncodeError_GetStart(exc, &start))
1298             return NULL;
1299         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1300             return NULL;
1301         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1302             return NULL;
1303         res = PyBytes_FromStringAndSize(NULL, end-start);
1304         if (!res) {
1305             Py_DECREF(object);
1306             return NULL;
1307         }
1308         outp = PyBytes_AsString(res);
1309         for (i = start; i < end; i++) {
1310             /* object is guaranteed to be "ready" */
1311             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1312             if (ch < 0xdc80 || ch > 0xdcff) {
1313                 /* Not a UTF-8b surrogate, fail with original exception */
1314                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1315                 Py_DECREF(res);
1316                 Py_DECREF(object);
1317                 return NULL;
1318             }
1319             *outp++ = ch - 0xdc00;
1320         }
1321         restuple = Py_BuildValue("(On)", res, end);
1322         Py_DECREF(res);
1323         Py_DECREF(object);
1324         return restuple;
1325     }
1326     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1327         PyObject *str;
1328         const unsigned char *p;
1329         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1330         int consumed = 0;
1331         if (PyUnicodeDecodeError_GetStart(exc, &start))
1332             return NULL;
1333         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1334             return NULL;
1335         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1336             return NULL;
1337         p = (const unsigned char*)PyBytes_AS_STRING(object);
1338         while (consumed < 4 && consumed < end-start) {
1339             /* Refuse to escape ASCII bytes. */
1340             if (p[start+consumed] < 128)
1341                 break;
1342             ch[consumed] = 0xdc00 + p[start+consumed];
1343             consumed++;
1344         }
1345         Py_DECREF(object);
1346         if (!consumed) {
1347             /* codec complained about ASCII byte. */
1348             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1349             return NULL;
1350         }
1351         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1352         if (str == NULL)
1353             return NULL;
1354         return Py_BuildValue("(Nn)", str, start+consumed);
1355     }
1356     else {
1357         wrong_exception_type(exc);
1358         return NULL;
1359     }
1360 }
1361 
1362 
/* METH_O entry point registered as the "strict" error handler;
   `self` is unused and the work is delegated to PyCodec_StrictErrors(). */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_StrictErrors(exc);
    return result;
}
1367 
1368 
/* METH_O entry point registered as the "ignore" error handler;
   `self` is unused and the work is delegated to PyCodec_IgnoreErrors(). */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_IgnoreErrors(exc);
    return result;
}
1373 
1374 
/* METH_O entry point registered as the "replace" error handler;
   `self` is unused and the work is delegated to PyCodec_ReplaceErrors(). */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);
    return result;
}
1379 
1380 
/* METH_O entry point registered as the "xmlcharrefreplace" error handler;
   `self` is unused and the work is delegated to
   PyCodec_XMLCharRefReplaceErrors(). */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);
    return result;
}
1385 
1386 
/* METH_O entry point registered as the "backslashreplace" error handler;
   `self` is unused and the work is delegated to
   PyCodec_BackslashReplaceErrors(). */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);
    return result;
}
1391 
/* METH_O entry point registered as the "namereplace" error handler;
   `self` is unused and the work is delegated to
   PyCodec_NameReplaceErrors(). */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_NameReplaceErrors(exc);
    return result;
}
1396 
/* METH_O entry point registered as the "surrogatepass" error handler;
   `self` is unused and the work is delegated to
   PyCodec_SurrogatePassErrors(). */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogatePassErrors(exc);
    return result;
}
1401 
/* METH_O entry point registered as the "surrogateescape" error handler;
   `self` is unused and the work is delegated to
   PyCodec_SurrogateEscapeErrors(). */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_SurrogateEscapeErrors(exc);
    return result;
}
1406 
_PyCodecRegistry_Init(void)1407 static int _PyCodecRegistry_Init(void)
1408 {
1409     static struct {
1410         const char *name;
1411         PyMethodDef def;
1412     } methods[] =
1413     {
1414         {
1415             "strict",
1416             {
1417                 "strict_errors",
1418                 strict_errors,
1419                 METH_O,
1420                 PyDoc_STR("Implements the 'strict' error handling, which "
1421                           "raises a UnicodeError on coding errors.")
1422             }
1423         },
1424         {
1425             "ignore",
1426             {
1427                 "ignore_errors",
1428                 ignore_errors,
1429                 METH_O,
1430                 PyDoc_STR("Implements the 'ignore' error handling, which "
1431                           "ignores malformed data and continues.")
1432             }
1433         },
1434         {
1435             "replace",
1436             {
1437                 "replace_errors",
1438                 replace_errors,
1439                 METH_O,
1440                 PyDoc_STR("Implements the 'replace' error handling, which "
1441                           "replaces malformed data with a replacement marker.")
1442             }
1443         },
1444         {
1445             "xmlcharrefreplace",
1446             {
1447                 "xmlcharrefreplace_errors",
1448                 xmlcharrefreplace_errors,
1449                 METH_O,
1450                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451                           "which replaces an unencodable character with the "
1452                           "appropriate XML character reference.")
1453             }
1454         },
1455         {
1456             "backslashreplace",
1457             {
1458                 "backslashreplace_errors",
1459                 backslashreplace_errors,
1460                 METH_O,
1461                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1462                           "which replaces malformed data with a backslashed "
1463                           "escape sequence.")
1464             }
1465         },
1466         {
1467             "namereplace",
1468             {
1469                 "namereplace_errors",
1470                 namereplace_errors,
1471                 METH_O,
1472                 PyDoc_STR("Implements the 'namereplace' error handling, "
1473                           "which replaces an unencodable character with a "
1474                           "\\N{...} escape sequence.")
1475             }
1476         },
1477         {
1478             "surrogatepass",
1479             {
1480                 "surrogatepass",
1481                 surrogatepass_errors,
1482                 METH_O
1483             }
1484         },
1485         {
1486             "surrogateescape",
1487             {
1488                 "surrogateescape",
1489                 surrogateescape_errors,
1490                 METH_O
1491             }
1492         }
1493     };
1494 
1495     PyInterpreterState *interp = _PyInterpreterState_GET();
1496     PyObject *mod;
1497 
1498     if (interp->codec_search_path != NULL)
1499         return 0;
1500 
1501     interp->codec_search_path = PyList_New(0);
1502     if (interp->codec_search_path == NULL) {
1503         return -1;
1504     }
1505 
1506     interp->codec_search_cache = PyDict_New();
1507     if (interp->codec_search_cache == NULL) {
1508         return -1;
1509     }
1510 
1511     interp->codec_error_registry = PyDict_New();
1512     if (interp->codec_error_registry == NULL) {
1513         return -1;
1514     }
1515 
1516     for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1517         PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1518         if (!func) {
1519             return -1;
1520         }
1521 
1522         int res = PyCodec_RegisterError(methods[i].name, func);
1523         Py_DECREF(func);
1524         if (res) {
1525             return -1;
1526         }
1527     }
1528 
1529     mod = PyImport_ImportModule("encodings");
1530     if (mod == NULL) {
1531         return -1;
1532     }
1533     Py_DECREF(mod);
1534     interp->codecs_initialized = 1;
1535     return 0;
1536 }
1537