1 /* stringlib: codec implementations */
2 
3 #if !STRINGLIB_IS_UNICODE
4 # error "codecs.h is specific to Unicode"
5 #endif
6 
7 #include "pycore_bitutils.h"      // _Py_bswap32()
8 
9 /* Mask to quickly check whether a C 'size_t' contains a
10    non-ASCII, UTF8-encoded char. */
11 #if (SIZEOF_SIZE_T == 8)
12 # define ASCII_CHAR_MASK 0x8080808080808080ULL
13 #elif (SIZEOF_SIZE_T == 4)
14 # define ASCII_CHAR_MASK 0x80808080U
15 #else
16 # error C 'size_t' size should be either 4 or 8!
17 #endif
18 
/* True when `ch` is a UTF-8 continuation byte, i.e. has the bit pattern
   10xxxxxx (0x80..0xBF).  The argument is evaluated twice, so it must be
   free of side effects. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
21 
22 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)23 STRINGLIB(utf8_decode)(const char **inptr, const char *end,
24                        STRINGLIB_CHAR *dest,
25                        Py_ssize_t *outpos)
26 {
27     Py_UCS4 ch;
28     const char *s = *inptr;
29     STRINGLIB_CHAR *p = dest + *outpos;
30 
31     while (s < end) {
32         ch = (unsigned char)*s;
33 
34         if (ch < 0x80) {
35             /* Fast path for runs of ASCII characters. Given that common UTF-8
36                input will consist of an overwhelming majority of ASCII
37                characters, we try to optimize for this case by checking
38                as many characters as a C 'size_t' can contain.
39                First, check if we can do an aligned read, as most CPUs have
40                a penalty for unaligned reads.
41             */
42             if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
43                 /* Help register allocation */
44                 const char *_s = s;
45                 STRINGLIB_CHAR *_p = p;
46                 while (_s + SIZEOF_SIZE_T <= end) {
47                     /* Read a whole size_t at a time (either 4 or 8 bytes),
48                        and do a fast unrolled copy if it only contains ASCII
49                        characters. */
50                     size_t value = *(const size_t *) _s;
51                     if (value & ASCII_CHAR_MASK)
52                         break;
53 #if PY_LITTLE_ENDIAN
54                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
55                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
56                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
57                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
58 # if SIZEOF_SIZE_T == 8
59                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
60                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
61                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
62                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
63 # endif
64 #else
65 # if SIZEOF_SIZE_T == 8
66                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
67                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
68                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
69                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
70                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
71                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
72                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
73                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
74 # else
75                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
76                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
77                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
78                     _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
79 # endif
80 #endif
81                     _s += SIZEOF_SIZE_T;
82                     _p += SIZEOF_SIZE_T;
83                 }
84                 s = _s;
85                 p = _p;
86                 if (s == end)
87                     break;
88                 ch = (unsigned char)*s;
89             }
90             if (ch < 0x80) {
91                 s++;
92                 *p++ = ch;
93                 continue;
94             }
95         }
96 
97         if (ch < 0xE0) {
98             /* \xC2\x80-\xDF\xBF -- 0080-07FF */
99             Py_UCS4 ch2;
100             if (ch < 0xC2) {
101                 /* invalid sequence
102                 \x80-\xBF -- continuation byte
103                 \xC0-\xC1 -- fake 0000-007F */
104                 goto InvalidStart;
105             }
106             if (end - s < 2) {
107                 /* unexpected end of data: the caller will decide whether
108                    it's an error or not */
109                 break;
110             }
111             ch2 = (unsigned char)s[1];
112             if (!IS_CONTINUATION_BYTE(ch2))
113                 /* invalid continuation byte */
114                 goto InvalidContinuation1;
115             ch = (ch << 6) + ch2 -
116                  ((0xC0 << 6) + 0x80);
117             assert ((ch > 0x007F) && (ch <= 0x07FF));
118             s += 2;
119             if (STRINGLIB_MAX_CHAR <= 0x007F ||
120                 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
121                 /* Out-of-range */
122                 goto Return;
123             *p++ = ch;
124             continue;
125         }
126 
127         if (ch < 0xF0) {
128             /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
129             Py_UCS4 ch2, ch3;
130             if (end - s < 3) {
131                 /* unexpected end of data: the caller will decide whether
132                    it's an error or not */
133                 if (end - s < 2)
134                     break;
135                 ch2 = (unsigned char)s[1];
136                 if (!IS_CONTINUATION_BYTE(ch2) ||
137                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
138                     /* for clarification see comments below */
139                     goto InvalidContinuation1;
140                 break;
141             }
142             ch2 = (unsigned char)s[1];
143             ch3 = (unsigned char)s[2];
144             if (!IS_CONTINUATION_BYTE(ch2)) {
145                 /* invalid continuation byte */
146                 goto InvalidContinuation1;
147             }
148             if (ch == 0xE0) {
149                 if (ch2 < 0xA0)
150                     /* invalid sequence
151                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
152                     goto InvalidContinuation1;
153             } else if (ch == 0xED && ch2 >= 0xA0) {
154                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
155                    will result in surrogates in range D800-DFFF. Surrogates are
156                    not valid UTF-8 so they are rejected.
157                    See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
158                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
159                 goto InvalidContinuation1;
160             }
161             if (!IS_CONTINUATION_BYTE(ch3)) {
162                 /* invalid continuation byte */
163                 goto InvalidContinuation2;
164             }
165             ch = (ch << 12) + (ch2 << 6) + ch3 -
166                  ((0xE0 << 12) + (0x80 << 6) + 0x80);
167             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
168             s += 3;
169             if (STRINGLIB_MAX_CHAR <= 0x07FF ||
170                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
171                 /* Out-of-range */
172                 goto Return;
173             *p++ = ch;
174             continue;
175         }
176 
177         if (ch < 0xF5) {
178             /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
179             Py_UCS4 ch2, ch3, ch4;
180             if (end - s < 4) {
181                 /* unexpected end of data: the caller will decide whether
182                    it's an error or not */
183                 if (end - s < 2)
184                     break;
185                 ch2 = (unsigned char)s[1];
186                 if (!IS_CONTINUATION_BYTE(ch2) ||
187                     (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
188                     /* for clarification see comments below */
189                     goto InvalidContinuation1;
190                 if (end - s < 3)
191                     break;
192                 ch3 = (unsigned char)s[2];
193                 if (!IS_CONTINUATION_BYTE(ch3))
194                     goto InvalidContinuation2;
195                 break;
196             }
197             ch2 = (unsigned char)s[1];
198             ch3 = (unsigned char)s[2];
199             ch4 = (unsigned char)s[3];
200             if (!IS_CONTINUATION_BYTE(ch2)) {
201                 /* invalid continuation byte */
202                 goto InvalidContinuation1;
203             }
204             if (ch == 0xF0) {
205                 if (ch2 < 0x90)
206                     /* invalid sequence
207                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
208                     goto InvalidContinuation1;
209             } else if (ch == 0xF4 && ch2 >= 0x90) {
210                 /* invalid sequence
211                    \xF4\x90\x80\x80- -- 110000- overflow */
212                 goto InvalidContinuation1;
213             }
214             if (!IS_CONTINUATION_BYTE(ch3)) {
215                 /* invalid continuation byte */
216                 goto InvalidContinuation2;
217             }
218             if (!IS_CONTINUATION_BYTE(ch4)) {
219                 /* invalid continuation byte */
220                 goto InvalidContinuation3;
221             }
222             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
223                  ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
224             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
225             s += 4;
226             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
227                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
228                 /* Out-of-range */
229                 goto Return;
230             *p++ = ch;
231             continue;
232         }
233         goto InvalidStart;
234     }
235     ch = 0;
236 Return:
237     *inptr = s;
238     *outpos = p - dest;
239     return ch;
240 InvalidStart:
241     ch = 1;
242     goto Return;
243 InvalidContinuation1:
244     ch = 2;
245     goto Return;
246 InvalidContinuation2:
247     ch = 3;
248     goto Return;
249 InvalidContinuation3:
250     ch = 4;
251     goto Return;
252 }
253 
/* ASCII_CHAR_MASK is only used by the UTF-8 decoder above. */
#undef ASCII_CHAR_MASK
255 
256 
257 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
258    PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
259    UCS-1 strings don't need to handle surrogates for example. */
260 Py_LOCAL_INLINE(char *)
STRINGLIB(utf8_encoder)261 STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
262                         PyObject *unicode,
263                         const STRINGLIB_CHAR *data,
264                         Py_ssize_t size,
265                         _Py_error_handler error_handler,
266                         const char *errors)
267 {
268     Py_ssize_t i;                /* index into data of next input character */
269     char *p;                     /* next free byte in output buffer */
270 #if STRINGLIB_SIZEOF_CHAR > 1
271     PyObject *error_handler_obj = NULL;
272     PyObject *exc = NULL;
273     PyObject *rep = NULL;
274 #endif
275 #if STRINGLIB_SIZEOF_CHAR == 1
276     const Py_ssize_t max_char_size = 2;
277 #elif STRINGLIB_SIZEOF_CHAR == 2
278     const Py_ssize_t max_char_size = 3;
279 #else /*  STRINGLIB_SIZEOF_CHAR == 4 */
280     const Py_ssize_t max_char_size = 4;
281 #endif
282 
283     assert(size >= 0);
284     if (size > PY_SSIZE_T_MAX / max_char_size) {
285         /* integer overflow */
286         PyErr_NoMemory();
287         return NULL;
288     }
289 
290     _PyBytesWriter_Init(writer);
291     p = _PyBytesWriter_Alloc(writer, size * max_char_size);
292     if (p == NULL)
293         return NULL;
294 
295     for (i = 0; i < size;) {
296         Py_UCS4 ch = data[i++];
297 
298         if (ch < 0x80) {
299             /* Encode ASCII */
300             *p++ = (char) ch;
301 
302         }
303         else
304 #if STRINGLIB_SIZEOF_CHAR > 1
305         if (ch < 0x0800)
306 #endif
307         {
308             /* Encode Latin-1 */
309             *p++ = (char)(0xc0 | (ch >> 6));
310             *p++ = (char)(0x80 | (ch & 0x3f));
311         }
312 #if STRINGLIB_SIZEOF_CHAR > 1
313         else if (Py_UNICODE_IS_SURROGATE(ch)) {
314             Py_ssize_t startpos, endpos, newpos;
315             Py_ssize_t k;
316             if (error_handler == _Py_ERROR_UNKNOWN) {
317                 error_handler = _Py_GetErrorHandler(errors);
318             }
319 
320             startpos = i-1;
321             endpos = startpos+1;
322 
323             while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
324                 endpos++;
325 
326             /* Only overallocate the buffer if it's not the last write */
327             writer->overallocate = (endpos < size);
328 
329             switch (error_handler)
330             {
331             case _Py_ERROR_REPLACE:
332                 memset(p, '?', endpos - startpos);
333                 p += (endpos - startpos);
334                 /* fall through */
335             case _Py_ERROR_IGNORE:
336                 i += (endpos - startpos - 1);
337                 break;
338 
339             case _Py_ERROR_SURROGATEPASS:
340                 for (k=startpos; k<endpos; k++) {
341                     ch = data[k];
342                     *p++ = (char)(0xe0 | (ch >> 12));
343                     *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
344                     *p++ = (char)(0x80 | (ch & 0x3f));
345                 }
346                 i += (endpos - startpos - 1);
347                 break;
348 
349             case _Py_ERROR_BACKSLASHREPLACE:
350                 /* subtract preallocated bytes */
351                 writer->min_size -= max_char_size * (endpos - startpos);
352                 p = backslashreplace(writer, p,
353                                      unicode, startpos, endpos);
354                 if (p == NULL)
355                     goto error;
356                 i += (endpos - startpos - 1);
357                 break;
358 
359             case _Py_ERROR_XMLCHARREFREPLACE:
360                 /* subtract preallocated bytes */
361                 writer->min_size -= max_char_size * (endpos - startpos);
362                 p = xmlcharrefreplace(writer, p,
363                                       unicode, startpos, endpos);
364                 if (p == NULL)
365                     goto error;
366                 i += (endpos - startpos - 1);
367                 break;
368 
369             case _Py_ERROR_SURROGATEESCAPE:
370                 for (k=startpos; k<endpos; k++) {
371                     ch = data[k];
372                     if (!(0xDC80 <= ch && ch <= 0xDCFF))
373                         break;
374                     *p++ = (char)(ch & 0xff);
375                 }
376                 if (k >= endpos) {
377                     i += (endpos - startpos - 1);
378                     break;
379                 }
380                 startpos = k;
381                 assert(startpos < endpos);
382                 /* fall through */
383             default:
384                 rep = unicode_encode_call_errorhandler(
385                       errors, &error_handler_obj, "utf-8", "surrogates not allowed",
386                       unicode, &exc, startpos, endpos, &newpos);
387                 if (!rep)
388                     goto error;
389 
390                 if (newpos < startpos) {
391                     writer->overallocate = 1;
392                     p = _PyBytesWriter_Prepare(writer, p,
393                                                max_char_size * (startpos - newpos));
394                     if (p == NULL)
395                         goto error;
396                 }
397                 else {
398                     /* subtract preallocated bytes */
399                     writer->min_size -= max_char_size * (newpos - startpos);
400                     /* Only overallocate the buffer if it's not the last write */
401                     writer->overallocate = (newpos < size);
402                 }
403 
404                 if (PyBytes_Check(rep)) {
405                     p = _PyBytesWriter_WriteBytes(writer, p,
406                                                   PyBytes_AS_STRING(rep),
407                                                   PyBytes_GET_SIZE(rep));
408                 }
409                 else {
410                     /* rep is unicode */
411                     if (PyUnicode_READY(rep) < 0)
412                         goto error;
413 
414                     if (!PyUnicode_IS_ASCII(rep)) {
415                         raise_encode_exception(&exc, "utf-8", unicode,
416                                                startpos, endpos,
417                                                "surrogates not allowed");
418                         goto error;
419                     }
420 
421                     p = _PyBytesWriter_WriteBytes(writer, p,
422                                                   PyUnicode_DATA(rep),
423                                                   PyUnicode_GET_LENGTH(rep));
424                 }
425 
426                 if (p == NULL)
427                     goto error;
428                 Py_CLEAR(rep);
429 
430                 i = newpos;
431             }
432 
433             /* If overallocation was disabled, ensure that it was the last
434                write. Otherwise, we missed an optimization */
435             assert(writer->overallocate || i == size);
436         }
437         else
438 #if STRINGLIB_SIZEOF_CHAR > 2
439         if (ch < 0x10000)
440 #endif
441         {
442             *p++ = (char)(0xe0 | (ch >> 12));
443             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
444             *p++ = (char)(0x80 | (ch & 0x3f));
445         }
446 #if STRINGLIB_SIZEOF_CHAR > 2
447         else /* ch >= 0x10000 */
448         {
449             assert(ch <= MAX_UNICODE);
450             /* Encode UCS4 Unicode ordinals */
451             *p++ = (char)(0xf0 | (ch >> 18));
452             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
453             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
454             *p++ = (char)(0x80 | (ch & 0x3f));
455         }
456 #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
457 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
458     }
459 
460 #if STRINGLIB_SIZEOF_CHAR > 1
461     Py_XDECREF(error_handler_obj);
462     Py_XDECREF(exc);
463 #endif
464     return p;
465 
466 #if STRINGLIB_SIZEOF_CHAR > 1
467  error:
468     Py_XDECREF(rep);
469     Py_XDECREF(error_handler_obj);
470     Py_XDECREF(exc);
471     return NULL;
472 #endif
473 }
474 
475 /* The pattern for constructing UCS2-repeated masks. */
476 #if SIZEOF_LONG == 8
477 # define UCS2_REPEAT_MASK 0x0001000100010001ul
478 #elif SIZEOF_LONG == 4
479 # define UCS2_REPEAT_MASK 0x00010001ul
480 #else
481 # error C 'long' size should be either 4 or 8!
482 #endif
483 
484 /* The mask for fast checking. */
485 #if STRINGLIB_SIZEOF_CHAR == 1
486 /* The mask for fast checking of whether a C 'long' contains a
487    non-ASCII or non-Latin1 UTF16-encoded characters. */
488 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
489 #else
490 /* The mask for fast checking of whether a C 'long' may contain
491    UTF16-encoded surrogate characters. This is an efficient heuristic,
492    assuming that non-surrogate characters with a code point >= 0x8000 are
493    rare in most input.
494 */
495 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
496 #endif
497 /* The mask for fast byte-swapping. */
498 #define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
499 /* Swap bytes. */
500 #define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
501                                  (((value) & STRIPPED_MASK) << 8))
502 
503 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)504 STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
505                         STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
506                         int native_ordering)
507 {
508     Py_UCS4 ch;
509     const unsigned char *q = *inptr;
510     STRINGLIB_CHAR *p = dest + *outpos;
511     /* Offsets from q for retrieving byte pairs in the right order. */
512 #if PY_LITTLE_ENDIAN
513     int ihi = !!native_ordering, ilo = !native_ordering;
514 #else
515     int ihi = !native_ordering, ilo = !!native_ordering;
516 #endif
517     --e;
518 
519     while (q < e) {
520         Py_UCS4 ch2;
521         /* First check for possible aligned read of a C 'long'. Unaligned
522            reads are more expensive, better to defer to another iteration. */
523         if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
524             /* Fast path for runs of in-range non-surrogate chars. */
525             const unsigned char *_q = q;
526             while (_q + SIZEOF_LONG <= e) {
527                 unsigned long block = * (const unsigned long *) _q;
528                 if (native_ordering) {
529                     /* Can use buffer directly */
530                     if (block & FAST_CHAR_MASK)
531                         break;
532                 }
533                 else {
534                     /* Need to byte-swap */
535                     if (block & SWAB(FAST_CHAR_MASK))
536                         break;
537 #if STRINGLIB_SIZEOF_CHAR == 1
538                     block >>= 8;
539 #else
540                     block = SWAB(block);
541 #endif
542                 }
543 #if PY_LITTLE_ENDIAN
544 # if SIZEOF_LONG == 4
545                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
546                 p[1] = (STRINGLIB_CHAR)(block >> 16);
547 # elif SIZEOF_LONG == 8
548                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
549                 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
550                 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
551                 p[3] = (STRINGLIB_CHAR)(block >> 48);
552 # endif
553 #else
554 # if SIZEOF_LONG == 4
555                 p[0] = (STRINGLIB_CHAR)(block >> 16);
556                 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
557 # elif SIZEOF_LONG == 8
558                 p[0] = (STRINGLIB_CHAR)(block >> 48);
559                 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
560                 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
561                 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
562 # endif
563 #endif
564                 _q += SIZEOF_LONG;
565                 p += SIZEOF_LONG / 2;
566             }
567             q = _q;
568             if (q >= e)
569                 break;
570         }
571 
572         ch = (q[ihi] << 8) | q[ilo];
573         q += 2;
574         if (!Py_UNICODE_IS_SURROGATE(ch)) {
575 #if STRINGLIB_SIZEOF_CHAR < 2
576             if (ch > STRINGLIB_MAX_CHAR)
577                 /* Out-of-range */
578                 goto Return;
579 #endif
580             *p++ = (STRINGLIB_CHAR)ch;
581             continue;
582         }
583 
584         /* UTF-16 code pair: */
585         if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
586             goto IllegalEncoding;
587         if (q >= e)
588             goto UnexpectedEnd;
589         ch2 = (q[ihi] << 8) | q[ilo];
590         q += 2;
591         if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
592             goto IllegalSurrogate;
593         ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
594 #if STRINGLIB_SIZEOF_CHAR < 4
595         /* Out-of-range */
596         goto Return;
597 #else
598         *p++ = (STRINGLIB_CHAR)ch;
599 #endif
600     }
601     ch = 0;
602 Return:
603     *inptr = q;
604     *outpos = p - dest;
605     return ch;
606 UnexpectedEnd:
607     ch = 1;
608     goto Return;
609 IllegalEncoding:
610     ch = 2;
611     goto Return;
612 IllegalSurrogate:
613     ch = 3;
614     goto Return;
615 }
/* The helper macros of the UTF-16 decoder are not used past this point. */
#undef UCS2_REPEAT_MASK
#undef FAST_CHAR_MASK
#undef STRIPPED_MASK
#undef SWAB
620 
621 
622 #if STRINGLIB_MAX_CHAR >= 0x80
623 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf16_encode)624 STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
625                         Py_ssize_t len,
626                         unsigned short **outptr,
627                         int native_ordering)
628 {
629     unsigned short *out = *outptr;
630     const STRINGLIB_CHAR *end = in + len;
631 #if STRINGLIB_SIZEOF_CHAR == 1
632     if (native_ordering) {
633         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
634         while (in < unrolled_end) {
635             out[0] = in[0];
636             out[1] = in[1];
637             out[2] = in[2];
638             out[3] = in[3];
639             in += 4; out += 4;
640         }
641         while (in < end) {
642             *out++ = *in++;
643         }
644     } else {
645 # define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
646         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
647         while (in < unrolled_end) {
648             out[0] = SWAB2(in[0]);
649             out[1] = SWAB2(in[1]);
650             out[2] = SWAB2(in[2]);
651             out[3] = SWAB2(in[3]);
652             in += 4; out += 4;
653         }
654         while (in < end) {
655             Py_UCS4 ch = *in++;
656             *out++ = SWAB2((Py_UCS2)ch);
657         }
658 #undef SWAB2
659     }
660     *outptr = out;
661     return len;
662 #else
663     if (native_ordering) {
664 #if STRINGLIB_MAX_CHAR < 0x10000
665         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
666         while (in < unrolled_end) {
667             /* check if any character is a surrogate character */
668             if (((in[0] ^ 0xd800) &
669                  (in[1] ^ 0xd800) &
670                  (in[2] ^ 0xd800) &
671                  (in[3] ^ 0xd800) & 0xf800) == 0)
672                 break;
673             out[0] = in[0];
674             out[1] = in[1];
675             out[2] = in[2];
676             out[3] = in[3];
677             in += 4; out += 4;
678         }
679 #endif
680         while (in < end) {
681             Py_UCS4 ch;
682             ch = *in++;
683             if (ch < 0xd800)
684                 *out++ = ch;
685             else if (ch < 0xe000)
686                 /* reject surrogate characters (U+D800-U+DFFF) */
687                 goto fail;
688 #if STRINGLIB_MAX_CHAR >= 0x10000
689             else if (ch >= 0x10000) {
690                 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
691                 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
692                 out += 2;
693             }
694 #endif
695             else
696                 *out++ = ch;
697         }
698     } else {
699 #define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
700 #if STRINGLIB_MAX_CHAR < 0x10000
701         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
702         while (in < unrolled_end) {
703             /* check if any character is a surrogate character */
704             if (((in[0] ^ 0xd800) &
705                  (in[1] ^ 0xd800) &
706                  (in[2] ^ 0xd800) &
707                  (in[3] ^ 0xd800) & 0xf800) == 0)
708                 break;
709             out[0] = SWAB2(in[0]);
710             out[1] = SWAB2(in[1]);
711             out[2] = SWAB2(in[2]);
712             out[3] = SWAB2(in[3]);
713             in += 4; out += 4;
714         }
715 #endif
716         while (in < end) {
717             Py_UCS4 ch = *in++;
718             if (ch < 0xd800)
719                 *out++ = SWAB2((Py_UCS2)ch);
720             else if (ch < 0xe000)
721                 /* reject surrogate characters (U+D800-U+DFFF) */
722                 goto fail;
723 #if STRINGLIB_MAX_CHAR >= 0x10000
724             else if (ch >= 0x10000) {
725                 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
726                 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
727                 out[0] = SWAB2(ch1);
728                 out[1] = SWAB2(ch2);
729                 out += 2;
730             }
731 #endif
732             else
733                 *out++ = SWAB2((Py_UCS2)ch);
734         }
735 #undef SWAB2
736     }
737     *outptr = out;
738     return len;
739   fail:
740     *outptr = out;
741     return len - (end - in + 1);
742 #endif
743 }
744 
745 static inline uint32_t
STRINGLIB(SWAB4)746 STRINGLIB(SWAB4)(STRINGLIB_CHAR ch)
747 {
748     uint32_t word = ch;
749 #if STRINGLIB_SIZEOF_CHAR == 1
750     /* high bytes are zero */
751     return (word << 24);
752 #elif STRINGLIB_SIZEOF_CHAR == 2
753     /* high bytes are zero */
754     return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8);
755 #else
756     return _Py_bswap32(word);
757 #endif
758 }
759 
760 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf32_encode)761 STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
762                         Py_ssize_t len,
763                         uint32_t **outptr,
764                         int native_ordering)
765 {
766     uint32_t *out = *outptr;
767     const STRINGLIB_CHAR *end = in + len;
768     if (native_ordering) {
769         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
770         while (in < unrolled_end) {
771 #if STRINGLIB_SIZEOF_CHAR > 1
772             /* check if any character is a surrogate character */
773             if (((in[0] ^ 0xd800) &
774                  (in[1] ^ 0xd800) &
775                  (in[2] ^ 0xd800) &
776                  (in[3] ^ 0xd800) & 0xf800) == 0)
777                 break;
778 #endif
779             out[0] = in[0];
780             out[1] = in[1];
781             out[2] = in[2];
782             out[3] = in[3];
783             in += 4; out += 4;
784         }
785         while (in < end) {
786             Py_UCS4 ch;
787             ch = *in++;
788 #if STRINGLIB_SIZEOF_CHAR > 1
789             if (Py_UNICODE_IS_SURROGATE(ch)) {
790                 /* reject surrogate characters (U+D800-U+DFFF) */
791                 goto fail;
792             }
793 #endif
794             *out++ = ch;
795         }
796     } else {
797         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
798         while (in < unrolled_end) {
799 #if STRINGLIB_SIZEOF_CHAR > 1
800             /* check if any character is a surrogate character */
801             if (((in[0] ^ 0xd800) &
802                  (in[1] ^ 0xd800) &
803                  (in[2] ^ 0xd800) &
804                  (in[3] ^ 0xd800) & 0xf800) == 0)
805                 break;
806 #endif
807             out[0] = STRINGLIB(SWAB4)(in[0]);
808             out[1] = STRINGLIB(SWAB4)(in[1]);
809             out[2] = STRINGLIB(SWAB4)(in[2]);
810             out[3] = STRINGLIB(SWAB4)(in[3]);
811             in += 4; out += 4;
812         }
813         while (in < end) {
814             Py_UCS4 ch = *in++;
815 #if STRINGLIB_SIZEOF_CHAR > 1
816             if (Py_UNICODE_IS_SURROGATE(ch)) {
817                 /* reject surrogate characters (U+D800-U+DFFF) */
818                 goto fail;
819             }
820 #endif
821             *out++ = STRINGLIB(SWAB4)(ch);
822         }
823     }
824     *outptr = out;
825     return len;
826 #if STRINGLIB_SIZEOF_CHAR > 1
827   fail:
828     *outptr = out;
829     return len - (end - in + 1);
830 #endif
831 }
832 
833 #endif
834