1import codecs
2import contextlib
3import io
4import locale
5import sys
6import unittest
7import encodings
8from unittest import mock
9
10from test import support
11from test.support import os_helper
12from test.support import warnings_helper
13
14try:
15    import _testcapi
16except ImportError:
17    _testcapi = None
18try:
19    import _testinternalcapi
20except ImportError:
21    _testinternalcapi = None
22
23try:
24    import ctypes
25except ImportError:
26    ctypes = None
27    SIZEOF_WCHAR_T = -1
28else:
29    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
30
def coding_checker(self, coder):
    """Return a helper that asserts *coder* consumes its whole input.

    The returned callable invokes ``coder(data)`` and checks the result
    against ``(expected, len(data))`` using *self*'s assertEqual.
    """
    def _verify(data, expected):
        outcome = coder(data)
        self.assertEqual(outcome, (expected, len(data)))
    return _verify
35
# On small versions of Windows like Windows IoT or Windows Nano Server not all codepages are present
def is_code_page_present(cp):
    """Return a truthy value if Windows code page *cp* is present.

    Queries kernel32!GetCPInfoExW through ctypes, so this is only
    meaningful on Windows (WinDLL/WINFUNCTYPE do not exist elsewhere).
    """
    from ctypes import POINTER, WINFUNCTYPE, WinDLL
    from ctypes.wintypes import BOOL, BYTE, WCHAR, UINT, DWORD

    MAX_LEADBYTES = 12  # 5 ranges, 2 bytes ea., 0 term.
    MAX_DEFAULTCHAR = 2 # single or double byte
    MAX_PATH = 260
    # Mirror of the Win32 CPINFOEXW structure; field order and sizes must
    # match the ABI exactly.
    class CPINFOEXW(ctypes.Structure):
        _fields_ = [("MaxCharSize", UINT),
                    ("DefaultChar", BYTE*MAX_DEFAULTCHAR),
                    ("LeadByte", BYTE*MAX_LEADBYTES),
                    ("UnicodeDefaultChar", WCHAR),
                    ("CodePage", UINT),
                    ("CodePageName", WCHAR*MAX_PATH)]

    prototype = WINFUNCTYPE(BOOL, UINT, DWORD, POINTER(CPINFOEXW))
    GetCPInfoEx = prototype(("GetCPInfoExW", WinDLL("kernel32")))
    info = CPINFOEXW()
    # GetCPInfoExW returns a nonzero BOOL when the code page is installed.
    return GetCPInfoEx(cp, 0, info)
56
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        # A negative size drains the whole buffer; the empty slice keeps
        # the buffer's original type (bytes vs. str).
        if size < 0:
            drained, self._buffer = self._buffer, self._buffer[:0]
            return drained
        drained, self._buffer = self._buffer[:size], self._buffer[size:]
        return drained
76
77
class MixInCheckStateHandling:
    """Mixin verifying incremental codec getstate()/setstate() round-trips."""

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every position; decoding must resume from saved state."""
        make_decoder = codecs.getincrementaldecoder(encoding)
        for split in range(len(s) + 1):
            dec = make_decoder()
            head = dec.decode(s[:split])
            state = dec.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds
            if not state[1]:
                # reset decoder to the default state without anything buffered
                dec.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertTrue(not dec.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, dec.getstate())
            # A fresh decoder primed with the saved state must be able to
            # finish decoding the remainder.
            dec = make_decoder()
            dec.setstate(state)
            tail = dec.decode(s[split:], True)
            self.assertEqual(u, head + tail)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every position; encoding must resume from saved state."""
        make_encoder = codecs.getincrementalencoder(encoding)
        for split in range(len(u) + 1):
            enc = make_encoder()
            head = enc.encode(u[:split])
            state = enc.getstate()
            enc = make_encoder()
            enc.setstate(state)
            tail = enc.encode(u[split:], True)
            self.assertEqual(s, head + tail)
110
111
class ReadTest(MixInCheckStateHandling):
    """Shared tests for stream readers and incremental decoders.

    Concrete subclasses are unittest.TestCase classes that set the class
    attributes ``encoding`` and (for the surrogate tests)
    ``ill_formed_sequence``.
    """

    def check_partial(self, input, partialresults):
        """Feed *input* one encoded byte at a time and compare each partial
        decode against the corresponding entry of *partialresults*."""
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults, strict=True):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """readline() must honour \\n, \\r\\n, \\r and U+2028 line ends,
        with and without keepends, and with a size limit."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        """Interleaving readline(), read(), read(n) and readlines() must not
        lose or duplicate buffered data (issues #8260, #32110, #16636,
        #12446)."""
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        """Iterating a reader over many \\r\\n-terminated lines must yield
        exactly the original lines (bug #1175396)."""
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() on a reader fed incrementally through a Queue must
        handle a \\r that may later be completed to \\r\\n."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """Long lines followed by short ones must be split correctly
        (bug #1098990, case a)."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Several similar-length lines must be split correctly
        (bug #1098990, case b)."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # What one ill-formed code unit sequence decodes to with the
    # "replace" error handler.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        """Lone surrogates must fail strict encoding and obey each error
        handler on both the encode and decode side."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding "" yields just the BOM (if the codec emits one).
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)

    def test_incremental_surrogatepass(self):
        # Test incremental decoder for surrogatepass handler:
        # see issue #24214
        # High surrogate
        data = '\uD901'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:], True), '\uD901')
        # Low surrogate
        data = '\uDC02'.encode(self.encoding, 'surrogatepass')
        for i in range(1, len(data)):
            dec = codecs.getincrementaldecoder(self.encoding)('surrogatepass')
            self.assertEqual(dec.decode(data[:i]), '')
            self.assertEqual(dec.decode(data[i:]), '\uDC02')
450
451
class UTF32Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-sensitive "utf-32" codec."""
    encoding = "utf-32"
    # An encoded lone low surrogate, in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """A StreamWriter must emit the BOM once, not per write() call."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding yields output only on 4-byte boundaries,
        after the 4-byte BOM has been consumed."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """'replace' and 'ignore' must consume a truncated final unit."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated final unit must raise under 'strict'."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """getstate()/setstate() round-trips for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
546
547
class UTF32LETest(ReadTest, unittest.TestCase):
    """Tests for the fixed-endianness "utf-32-le" codec (no BOM)."""
    encoding = "utf-32-le"
    # An encoded lone low surrogate.
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        """Byte-by-byte decoding yields output only on 4-byte boundaries;
        no BOM is expected."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        """A non-BMP character encodes as four little-endian bytes."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated final unit must raise under 'strict'."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
592
593
class UTF32BETest(ReadTest, unittest.TestCase):
    """Tests for the fixed-endianness "utf-32-be" codec (no BOM)."""
    encoding = "utf-32-be"
    # An encoded lone low surrogate.
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        """Byte-by-byte decoding yields output only on 4-byte boundaries;
        no BOM is expected."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        """A non-BMP character encodes as four big-endian bytes."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated final unit must raise under 'strict'."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
638
639
class UTF16Test(ReadTest, unittest.TestCase):
    """Tests for the BOM-sensitive "utf-16" codec."""
    encoding = "utf-16"
    # An encoded lone low surrogate, in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """A StreamWriter must emit the BOM once, not per write() call."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte decoding yields output only on 2-byte boundaries,
        after the 2-byte BOM has been consumed (4 bytes for a surrogate
        pair)."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """'replace' and 'ignore' must consume a truncated final unit."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated final unit must raise under 'strict'."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """getstate()/setstate() round-trips for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # If encoding is not None, then
        # files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
        with open(os_helper.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(os_helper.TESTFN, 'r',
                         encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)

    def test_invalid_modes(self):
        """codecs.open() must reject universal-newline and text modes."""
        for mode in ('U', 'rU', 'r+U'):
            with self.assertRaises(ValueError) as cm:
                codecs.open(os_helper.TESTFN, mode, encoding=self.encoding)
            self.assertIn('invalid mode', str(cm.exception))

        for mode in ('rt', 'wt', 'at', 'r+t'):
            with self.assertRaises(ValueError) as cm:
                codecs.open(os_helper.TESTFN, mode, encoding=self.encoding)
            self.assertIn("can't have text and binary mode at once",
                          str(cm.exception))
737
738
class UTF16LETest(ReadTest, unittest.TestCase):
    """Tests for the fixed-endianness "utf-16-le" codec (no BOM)."""
    encoding = "utf-16-le"
    # An encoded lone low surrogate.
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        """Byte-by-byte decoding yields output only on 2-byte boundaries
        (4 bytes for a surrogate pair); no BOM is expected."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Truncated or ill-formed input: 'strict' raises, 'replace'
        substitutes U+FFFD for each bad unit."""
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through a surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
782
class UTF16BETest(ReadTest, unittest.TestCase):
    """Tests for the utf-16-be codec.

    Most checks are inherited from ReadTest (defined earlier in this
    file) and parametrized through the class attributes below.
    """
    encoding = "utf-16-be"
    # A lone low surrogate (U+DC80) in big-endian byte order:
    # ill-formed UTF-16 that strict decoding must reject.
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        # Feed the encoded bytes to an incremental decoder one byte at a
        # time; entry i is the expected output after i+1 bytes.  Each BMP
        # character takes 2 bytes and U+10000 takes a 4-byte surrogate
        # pair, hence the repeated intermediate states.
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each case pairs an ill-formed byte sequence (truncated unit,
        # lone surrogate, ...) with its expected 'replace' output; strict
        # decoding of the same bytes must raise.
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        # U+10203 encodes as the surrogate pair D800 DE03; big-endian
        # byte order puts the high byte of each unit first.
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
826
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the utf-8 codec.

    Most checks are inherited from ReadTest (defined earlier in this
    file) and parametrized through the class attributes below.
    UTF8SigTest reuses this whole class with a BOM-prefixed encoding.
    """
    encoding = "utf-8"
    # UTF-8 encoding of the lone low surrogate U+DC80: ill-formed input
    # that strict decoding must reject.
    ill_formed_sequence = b"\xed\xb2\x80"
    # Under 'replace', each of the three bad bytes becomes one U+FFFD.
    ill_formed_sequence_replace = "\ufffd" * 3
    # Plain utf-8 emits no byte-order mark (overridden by UTF8SigTest).
    BOM = b''

    def test_partial(self):
        # Feed the encoded bytes to an incremental decoder one byte at a
        # time; entry i is the expected output after i+1 bytes.  A
        # character only appears once its final byte has arrived.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Exercise decoder state round-tripping (via ReadTest's
        # check_state_handling_decode, defined earlier in this file) over
        # characters of every encoded length, 1 to 4 bytes.
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Stray continuation/invalid bytes decoded under each of the four
        # common error handlers.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # The remaining surrogates cannot be escaped; the exception must
        # point at exactly the offending slice of the input string.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # 'surrogatepass' lets lone surrogates through in both
        # directions, each encoded as a regular three-byte sequence.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or otherwise broken surrogate byte sequences must
        # still raise, even under 'surrogatepass'.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")

    def test_incremental_errors(self):
        # Test that the incremental decoder can fail with final=False.
        # See issue #24214
        # Start with bytes that can never begin a sequence, then add every
        # valid prefix followed by a byte that cannot continue it.
        cases = [b'\x80', b'\xBF', b'\xC0', b'\xC1', b'\xF5', b'\xF6', b'\xFF']
        for prefix in (b'\xC2', b'\xDF', b'\xE0', b'\xE0\xA0', b'\xEF',
                       b'\xEF\xBF', b'\xF0', b'\xF0\x90', b'\xF0\x90\x80',
                       b'\xF4', b'\xF4\x8F', b'\xF4\x8F\xBF'):
            for suffix in b'\x7F', b'\xC0':
                cases.append(prefix + suffix)
        # Second bytes that are out of range for their start byte
        # (overlong forms, surrogates, beyond U+10FFFF).
        cases.extend((b'\xE0\x80', b'\xE0\x9F', b'\xED\xA0\x80',
                      b'\xED\xBF\xBF', b'\xF0\x80', b'\xF0\x8F', b'\xF4\x90'))

        for data in cases:
            with self.subTest(data=data):
                dec = codecs.getincrementaldecoder(self.encoding)()
                self.assertRaises(UnicodeDecodeError, dec.decode, data)
919
920
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the utf-7 codec.

    Most checks are inherited from ReadTest (defined earlier in this
    file) and parametrized through the ``encoding`` attribute.
    """
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # The shift character '+' itself is escaped as '+-'
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters: everything below 0x80 that is not in
        # D, O, '+' or the whitespace set must be base64-shifted.
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Feed the encoded bytes to an incremental decoder one byte at a
        # time; entry i is the expected output after i+1 bytes.  Inside a
        # base64 run, output only appears once enough bytes have arrived
        # to make the decoded characters unambiguous.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Each case pairs an ill-formed byte sequence (bad byte, truncated
        # or malformed base64 run, ...) with its expected 'replace'
        # output; strict decoding of the same bytes must raise.
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
            (b'a+@b', 'a\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # Supplementary characters travel as surrogate pairs inside the
        # base64 run; the terminating '-' is optional when decoding.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # A lone high surrogate survives only when its base64 run is
        # complete; truncated or padded runs decode to U+FFFD under
        # 'replace'.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
1053
1054
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() entry point."""

    def test_errors(self):
        # A single stray byte cannot form a UTF-16 code unit.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # The input argument is mandatory.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1062
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode()."""

    def test_array(self):
        import array
        # Any buffer-like object is accepted, not just bytes.
        buf = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buf), (b"spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # Missing argument and non-buffer argument both fail.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1078
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for the utf-8-sig codec: UTF-8 with a leading BOM that is
    emitted on encoding and stripped (once) on decoding.

    Inherits the whole UTF8Test suite; the BOM class attribute makes the
    inherited byte-level expectations account for the signature.
    """
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # Feed the encoded bytes to an incremental decoder one byte at a
        # time; entry i is the expected output after i+1 bytes.  Only the
        # first BOM is treated as a signature and skipped.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the BOM its encoder adds.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_decode(self, bytestring, unistring):
        # Decode *bytestring* through a utf-8-sig StreamReader using a
        # variety of chunk sizes (including unbounded read) and check that
        # the accumulated output equals *unistring* every time.
        reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + list(range(1, 11)) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while True:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_stream_bom(self):
        # Input starting with a BOM: the signature must be stripped.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decode(bytestring, unistring)

    def test_stream_bare(self):
        # Input without a BOM must decode to the same result.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream_decode(bytestring, unistring)
1163
1164
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode().

    The function returns a (decoded bytes, consumed length) pair; the
    coding_checker() helper defined at the top of this file asserts both.
    """
    def test_empty(self):
        # Both bytes and bytearray inputs are accepted.
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte other than backslash passes through unchanged.
        decode = codecs.escape_decode
        for b in range(256):
            b = bytes([b])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # Recognized escapes: line continuation, quotes, and the standard
        # single-character, octal and hex forms.
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        # Octal escapes stop after at most three digits.
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        # Hex escapes consume exactly two digits.
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognized escapes are kept as-is but emit a
        # DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")
        # Out-of-range octal escapes wrap modulo 256 (and warn).
        for i in range(0o400, 0o1000):
            with self.assertWarns(DeprecationWarning):
                check(rb'\%o' % i, bytes([i & 0o377]))

    def test_errors(self):
        # Truncated \x escapes: 'strict' raises, 'ignore' drops them,
        # 'replace' substitutes '?'.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
1226
1227
1228# From RFC 3492
# Each entry pairs a Unicode string with its expected Punycode encoding
# (sample strings from RFC 3492).
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Import-time sanity check: report any entry that is not a two-tuple.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
1335
1336
class PunycodeTest(unittest.TestCase):
    """Round-trip the RFC 3492 sample strings through the punycode codec."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            produced = uni.encode("punycode").decode("ascii")
            expected = puny.decode("ascii")
            self.assertEqual(produced.lower(), expected.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # Round-tripping the sample through ASCII must not change
            # the decoded result.
            roundtripped = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, roundtripped.decode("punycode"))

    def test_decode_invalid(self):
        cases = [
            (b"xn--w&", "strict", UnicodeError()),
            (b"xn--w&", "ignore", "xn-"),
        ]
        for puny, errors, expected in cases:
            with self.subTest(puny=puny, errors=errors):
                if isinstance(expected, Exception):
                    self.assertRaises(UnicodeError, puny.decode,
                                      "punycode", errors)
                else:
                    self.assertEqual(puny.decode("punycode", errors),
                                     expected)
1367
1368
1369# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is (input, expected output), both as UTF-8 bytes.  An
# expected value of None marks input that nameprep must reject; a
# (None, None) entry is a skipped vector (kept so positions still match
# the section numbers in the reference table).
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1521
1522
class NameprepTest(unittest.TestCase):
    def test_nameprep(self):
        # Walk the libidn test vectors; positions are reported 1-based so
        # failures match the "3.N" numbering in the vector table above.
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Vector disabled above; skip it but keep the numbering.
                continue
            # Vectors are stored as UTF-8 bytes (surrogates included).
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Prohibited input: nameprep must reject it outright.
                self.assertRaises(UnicodeError, nameprep, orig)
                continue
            prepped = str(prepped, "utf-8", "surrogatepass")
            try:
                self.assertEqual(nameprep(orig), prepped)
            except Exception as e:
                raise support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1541
1542
class IDNACodecTest(unittest.TestCase):
    """Tests for the built-in "idna" codec in its one-shot, stream,
    iterator and incremental forms."""

    def test_builtin_decode(self):
        cases = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for raw, expected in cases:
            self.assertEqual(str(raw, "idna"), expected)

    def test_builtin_encode(self):
        cases = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in cases:
            self.assertEqual(text.encode("idna"), expected)

    def test_builtin_decode_length_limit(self):
        # Overlong labels must be rejected rather than decoded.
        for overlong in (b"xn--016c" + b"a" * 1100,
                         b"xn--016c" + b"a" * 70):
            with self.assertRaisesRegex(UnicodeError, "too long"):
                overlong.decode("idna")

    def test_stream(self):
        reader = codecs.getreader("idna")(io.BytesIO(b"abc"))
        reader.read(3)
        self.assertEqual(reader.read(), "")

    def test_incremental_decode(self):
        # iterdecode() fed one byte at a time must reassemble each name.
        samples = [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]
        for raw, expected in samples:
            pieces = (bytes([byte]) for byte in raw)
            self.assertEqual("".join(codecs.iterdecode(pieces, "idna")),
                             expected)

        # A label is only emitted once its trailing dot (or EOF) is seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # reset() discards any buffered partial label.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam", ), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o", ), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        samples = [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]
        for text, expected in samples:
            self.assertEqual(b"".join(codecs.iterencode(text, "idna")),
                             expected)

        # Labels are encoded lazily, once complete.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                b"python.org".decode, "idna", errors)
1634
1635
1636class CodecsModuleTest(unittest.TestCase):
1637
1638    def test_decode(self):
1639        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1640                         '\xe4\xf6\xfc')
1641        self.assertRaises(TypeError, codecs.decode)
1642        self.assertEqual(codecs.decode(b'abc'), 'abc')
1643        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
1644
1645        # test keywords
1646        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1647                         '\xe4\xf6\xfc')
1648        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1649                         '[]')
1650
1651    def test_encode(self):
1652        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1653                         b'\xe4\xf6\xfc')
1654        self.assertRaises(TypeError, codecs.encode)
1655        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
1656        self.assertEqual(codecs.encode('abc'), b'abc')
1657        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
1658
1659        # test keywords
1660        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1661                         b'\xe4\xf6\xfc')
1662        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1663                         b'[]')
1664
1665    def test_register(self):
1666        self.assertRaises(TypeError, codecs.register)
1667        self.assertRaises(TypeError, codecs.register, 42)
1668
1669    def test_unregister(self):
1670        name = "nonexistent_codec_name"
1671        search_function = mock.Mock()
1672        codecs.register(search_function)
1673        self.assertRaises(TypeError, codecs.lookup, name)
1674        search_function.assert_called_with(name)
1675        search_function.reset_mock()
1676
1677        codecs.unregister(search_function)
1678        self.assertRaises(LookupError, codecs.lookup, name)
1679        search_function.assert_not_called()
1680
1681    def test_lookup(self):
1682        self.assertRaises(TypeError, codecs.lookup)
1683        self.assertRaises(LookupError, codecs.lookup, "__spam__")
1684        self.assertRaises(LookupError, codecs.lookup, " ")
1685
1686    def test_getencoder(self):
1687        self.assertRaises(TypeError, codecs.getencoder)
1688        self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1689
1690    def test_getdecoder(self):
1691        self.assertRaises(TypeError, codecs.getdecoder)
1692        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1693
1694    def test_getreader(self):
1695        self.assertRaises(TypeError, codecs.getreader)
1696        self.assertRaises(LookupError, codecs.getreader, "__spam__")
1697
1698    def test_getwriter(self):
1699        self.assertRaises(TypeError, codecs.getwriter)
1700        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
1701
1702    def test_lookup_issue1813(self):
1703        # Issue #1813: under Turkish locales, lookup of some codecs failed
1704        # because 'I' is lowercased as "ı" (dotless i)
1705        oldlocale = locale.setlocale(locale.LC_CTYPE)
1706        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1707        try:
1708            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1709        except locale.Error:
1710            # Unsupported locale on this system
1711            self.skipTest('test needs Turkish locale')
1712        c = codecs.lookup('ASCII')
1713        self.assertEqual(c.name, 'ascii')
1714
1715    def test_all(self):
1716        api = (
1717            "encode", "decode",
1718            "register", "CodecInfo", "Codec", "IncrementalEncoder",
1719            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1720            "getencoder", "getdecoder", "getincrementalencoder",
1721            "getincrementaldecoder", "getreader", "getwriter",
1722            "register_error", "lookup_error",
1723            "strict_errors", "replace_errors", "ignore_errors",
1724            "xmlcharrefreplace_errors", "backslashreplace_errors",
1725            "namereplace_errors",
1726            "open", "EncodedFile",
1727            "iterencode", "iterdecode",
1728            "BOM", "BOM_BE", "BOM_LE",
1729            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1730            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1731            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
1732            "StreamReaderWriter", "StreamRecoder",
1733        )
1734        self.assertCountEqual(api, codecs.__all__)
1735        for api in codecs.__all__:
1736            getattr(codecs, api)
1737
1738    def test_open(self):
1739        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
1740        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1741            with self.subTest(mode), \
1742                    codecs.open(os_helper.TESTFN, mode, 'ascii') as file:
1743                self.assertIsInstance(file, codecs.StreamReaderWriter)
1744
1745    def test_undefined(self):
1746        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1747        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1748        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1749        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1750        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1751            self.assertRaises(UnicodeError,
1752                codecs.encode, 'abc', 'undefined', errors)
1753            self.assertRaises(UnicodeError,
1754                codecs.decode, b'abc', 'undefined', errors)
1755
1756    def test_file_closes_if_lookup_error_raised(self):
1757        mock_open = mock.mock_open()
1758        with mock.patch('builtins.open', mock_open) as file:
1759            with self.assertRaises(LookupError):
1760                codecs.open(os_helper.TESTFN, 'wt', 'invalid-encoding')
1761
1762            file().close.assert_called()
1763
1764
class StreamReaderTest(unittest.TestCase):
    """Basic StreamReader behavior on multi-byte UTF-8 data."""

    def setUp(self):
        # Two Hangul syllables separated by a newline, encoded as UTF-8.
        self.reader = codecs.getreader('utf-8')
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        stream_reader = self.reader(self.stream)
        lines = stream_reader.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
1774
1775
class EncodedFileTest(unittest.TestCase):
    """codecs.EncodedFile transcodes between file and data encodings."""

    def test_basic(self):
        # Reads transcode the underlying UTF-8 bytes to UTF-16-LE.
        raw = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        wrapped = codecs.EncodedFile(raw, 'utf-16-le', 'utf-8')
        self.assertEqual(wrapped.read(), b'\\\xd5\n\x00\x00\xae')

        # Writes transcode the other way: UTF-8 input, Latin-1 on disk.
        sink = io.BytesIO()
        wrapped = codecs.EncodedFile(sink, 'utf-8', 'latin-1')
        wrapped.write(b'\xc3\xbc')
        self.assertEqual(sink.getvalue(), b'\xfc')
1787
# Every stdlib text codec exercised by the round-trip tests below.
# "mbcs" and "oem" are appended further down only when the running
# interpreter provides them (they are platform-specific builds).
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# Only test the platform-specific codecs when this build actually has them
# (presumably Windows builds -- the hasattr check is the authority here).
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
]
1905
1906
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    def test_basics(self):
        # Round-trip a simple ASCII string through every codec, via the
        # one-shot encoder/decoder, the stream reader/writer, and the
        # incremental encoder/decoder (including iterencode/iterdecode).
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                # Restore the suffix before comparing; presumably lookup()
                # normalizes "_codec" names -- TODO confirm.
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            # Skip the mbcs alias on Windows
            if name != "mbcs":
                self.assertEqual(encoding.replace("_", "-"),
                                 name.replace("_", "-"))

            # One-shot: the encoder must consume the whole input, and
            # decoding its output must reproduce the original string.
            (b, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "encoding=%r" % encoding)
            (chars, size) = codecs.getdecoder(encoding)(b)
            self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    # Feed one character at a time so stateful codecs must
                    # carry their state across write() calls.
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    # Likewise decode byte-by-byte through the reader.
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    # final=True flushes any buffered state.
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
    def test_basics_capi(self):
        # Same incremental round-trip as test_basics, but with codec
        # objects obtained through the C API (_testcapi wrappers).
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Calling a decoder with no arguments, or (mostly) with a non-bytes
        # argument, must raise TypeError rather than crash.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Calling an encoder with no arguments must raise TypeError.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        # (helpers come from MixInCheckStateHandling, defined elsewhere).
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
2067
2068
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode with each supported mapping type.

    The decoding table may be a str (indexed by byte value), a dict
    mapping int -> str, or a dict mapping int -> int; each variant is
    exercised with every error handler.
    """

    def test_decode_with_string_map(self):
        # A str table maps byte value -> the character at that index.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"),
            ("abc", 3)
        )

        # Non-BMP characters in the table are allowed.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
            ("\U0010FFFFbc", 3)
        )

        # Under "strict", bytes past the end of the table, or mapped to
        # U+FFFE, are decoding errors.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab"
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe"
        )

        # "replace" substitutes U+FFFD for unmapped bytes.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab\ufffe"),
            ("ab\ufffd", 3)
        )

        # "backslashreplace" renders the offending byte as an escape.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab"),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace", "ab\ufffe"),
            ("ab\\x02", 3)
        )

        # "ignore" drops unmapped bytes entirely.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab"),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", "ab\ufffe"),
            ("ab", 3)
        )

        # An empty table plus "ignore" still consumes all input bytes.
        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", ""),
            ("", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        # A dict table may map each byte to a str of any length
        # (including multi-character and empty replacements).
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: 'c'}),
            ("abc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
            ("AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 'a', 1: 'b', 2: ''}),
            ("ab", 3)
        )

        # Missing keys, None values, and U+FFFE are all "undefined" and
        # raise under "strict".
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b'}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 'a', 1: 'b', 2: '\ufffe'}
        )

        # The same three "undefined" cases with the "replace" handler.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b'}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\ufffd", 3)
        )

        # ... and with "backslashreplace".
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b'}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab\\x02", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab\\x02", 3)
        )

        # ... and with "ignore".
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b'}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: None}),
            ("ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: 'a', 1: 'b', 2: '\ufffe'}),
            ("ab", 3)
        )

        allbytes = bytes(range(256))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            ("", len(allbytes))
        )

        # Int values outside range(0x110000) in the table are a TypeError,
        # not a decoding error.
        self.assertRaisesRegex(TypeError,
            "character mapping must be in range\\(0x110000\\)",
            codecs.charmap_decode,
            b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: -2}
        )

        self.assertRaisesRegex(TypeError,
            "character mapping must be in range\\(0x110000\\)",
            codecs.charmap_decode,
            b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: 999999999}
        )

    def test_decode_with_int2int_map(self):
        # A dict table may also map each byte to a code point (int).
        a = ord('a')
        b = ord('b')
        c = ord('c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            ("abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            ("\U0010FFFFbc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: sys.maxunicode, 1: b, 2: c}),
            (chr(sys.maxunicode) + "bc", 3)
        )

        # Values beyond sys.maxunicode are a TypeError.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: sys.maxunicode + 1, 1: b, 2: c}
        )

        # Missing keys and 0xFFFE count as undefined mappings.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: a, 1: b}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "backslashreplace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab\\x02", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            ("ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            ("ab", 3)
        )
2315
2316
class WithStmtTest(unittest.TestCase):
    """codecs wrapper objects work as context managers."""

    def test_encodedfile(self):
        underlying = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(underlying, "latin-1", "utf-8") as ef:
            self.assertEqual(ef.read(), b"\xfc")
        # Leaving the with-block closes the wrapped file object too.
        self.assertTrue(underlying.closed)

    def test_streamreaderwriter(self):
        underlying = io.BytesIO(b"\xc3\xbc")
        info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(underlying, info.streamreader,
                                       info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), "\xfc")
2330
2331
class TypesTest(unittest.TestCase):
    """Input-type restrictions of the low-level codec functions."""

    def test_decode_unicode(self):
        # Most of the low-level decoders reject str input outright.
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decode in decoders:
            self.assertRaises(TypeError, decode, "xxx")

    def test_unicode_escape(self):
        # The escape codecs accept str as well as bytes, with identical
        # results; both are checked with the same set of inputs.
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))
            # Out-of-range \U escapes honour the error handler.
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(decode(r"\U00110000", "backslashreplace"),
                             (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
2371
2372
class UnicodeEscapeTest(ReadTest, unittest.TestCase):
    """Tests for the "unicode-escape" codec."""

    encoding = "unicode-escape"

    # Disable the inherited ReadTest case; it does not apply to this
    # escape-based codec.
    test_lone_surrogates = None

    def test_empty(self):
        # Empty input round-trips to empty output with 0 items consumed.
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII (except the backslash) encodes to itself.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any single byte other than the backslash decodes to itself.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        # Control characters, non-ASCII and non-BMP characters are
        # written as backslash escapes.
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        # All recognized escape forms: line continuation, single-letter
        # escapes, octal escapes, \x, \u and \U escapes.
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognized escape sequences pass through unchanged but emit
        # a DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")
        # Octal escapes above \377 still produce the value but warn.
        for i in range(0o400, 0o1000):
            with self.assertWarns(DeprecationWarning):
                check(rb'\%o' % i, chr(i))

    def test_decode_errors(self):
        # Truncated \x, \u and \U escapes raise UnicodeDecodeError under
        # "strict", and are dropped/substituted by "ignore"/"replace".
        decode = codecs.unicode_escape_decode
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # \U escapes beyond the Unicode range are errors as well.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))

    def test_partial(self):
        # Incremental decoding: each list entry is the decoded output
        # expected after one more byte of input has been fed (the driver
        # is ReadTest.check_partial, defined elsewhere in this file).
        self.check_partial(
            "\x00\t\n\r\\\xff\uffff\U00010000",
            [
                '',
                '',
                '',
                '\x00',
                '\x00',
                '\x00\t',
                '\x00\t',
                '\x00\t\n',
                '\x00\t\n',
                '\x00\t\n\r',
                '\x00\t\n\r',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff\U00010000',
            ]
        )
2505
class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
    """Tests for the "raw-unicode-escape" codec, which handles only
    \\u and \\U escapes and leaves all other bytes untouched."""

    encoding = "raw-unicode-escape"

    # Disable the inherited ReadTest case; it does not apply to this
    # escape-based codec.
    test_lone_surrogates = None

    def test_empty(self):
        # Empty input round-trips to empty output with 0 items consumed.
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every Latin-1 character encodes to the corresponding byte.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Every byte decodes to the corresponding character.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        # Backslash pairs other than \u and \U are emitted verbatim;
        # non-Latin-1 characters become \u / \U escapes.
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        # Only \u and \U are interpreted; any other backslash pair
        # decodes to itself.
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        # Truncated \u and \U escapes raise UnicodeDecodeError under
        # "strict", and are dropped/substituted by "ignore"/"replace".
        decode = codecs.raw_unicode_escape_decode
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # \U escapes beyond the Unicode range are errors as well.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))

    def test_partial(self):
        # Incremental decoding: each list entry is the decoded output
        # expected after one more byte of input has been fed (the driver
        # is ReadTest.check_partial, defined elsewhere in this file).
        self.check_partial(
            "\x00\t\n\r\\\xff\uffff\U00010000",
            [
                '\x00',
                '\x00\t',
                '\x00\t\n',
                '\x00\t\n\r',
                '\x00\t\n\r',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff',
                '\x00\t\n\r\\\xff\uffff\U00010000',
            ]
        )
2587
2588
class EscapeEncodeTest(unittest.TestCase):

    def test_escape_encode(self):
        """codecs.escape_encode escapes bytes and rejects non-bytes input."""
        cases = (
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        )
        for raw, expected in cases:
            with self.subTest(data=raw):
                self.assertEqual(codecs.escape_encode(raw), expected)
        # Only bytes is accepted -- neither str nor bytearray.
        for bad_input in ('spam', bytearray(b'spam')):
            self.assertRaises(TypeError, codecs.escape_encode, bad_input)
2607
2608
class SurrogateEscapeTest(unittest.TestCase):
    """The surrogateescape error handler round-trips undecodable bytes."""

    def _roundtrip(self, raw, encoding, text):
        # Decoding maps each undecodable byte to a low surrogate, and
        # encoding maps the surrogate back to the original byte.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # A stray non-UTF-8 byte.
        self._roundtrip(b"foo\x80bar", "utf-8", "foo\udc80bar")
        # A surrogate smuggled in as (invalid) UTF-8.
        self._roundtrip(b"\xed\xb0\x80", "utf-8", "\udced\udcb0\udc80")

    def test_ascii(self):
        # A byte outside the ASCII range.
        self._roundtrip(b"foo\x80bar", "ascii", "foo\udc80bar")

    def test_charmap(self):
        # \xa5 is unmapped in iso-8859-3.
        self._roundtrip(b"foo\xa5bar", "iso-8859-3", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373: encoding only -- latin-1 can decode every byte, so
        # surrogates never arise on the decode side.
        self.assertEqual(
            "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1",
                                                    "surrogateescape"),
            b"\xe4\xeb\xef\xf6\xfc")
2641
2642
class BomTest(unittest.TestCase):
    def test_seek0(self):
        """A BOM must be written exactly once, and only at file offset 0."""
        payload = "1234567890"
        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        self.addCleanup(os_helper.unlink, os_helper.TESTFN)
        for encoding in encodings:
            # The BOM must appear only once despite multiple writes.
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.write(payload)
                f.write(payload)
                f.seek(0)
                self.assertEqual(f.read(), payload * 2)
                f.seek(0)
                self.assertEqual(f.read(), payload * 2)

            # Seeking back to offset 0 must re-emit the BOM on write.
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.write(payload[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(payload)
                f.seek(0)
                self.assertEqual(f.read(), payload)

            # Same zero-seek check against the underlying StreamWriter.
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(payload[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(payload)
                f.seek(0)
                self.assertEqual(f.read(), payload)

            # A seek() to a non-zero position must NOT re-emit the BOM.
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.write(payload)
                f.seek(f.tell())
                f.write(payload)
                f.seek(0)
                self.assertEqual(f.read(), payload * 2)

            # Same non-zero-seek check against the StreamWriter.
            with codecs.open(os_helper.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(payload)
                f.writer.seek(f.writer.tell())
                f.writer.write(payload)
                f.seek(0)
                self.assertEqual(f.read(), payload * 2)
2698
2699
# Alternate names each transform codec is registered under
# (rot_13 is str<->str, the rest are bytes<->bytes).
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

# bytes->bytes transform codecs that are always available
bytes_transform_encodings = ["base64_codec", "uu_codec",
                             "quopri_codec", "hex_codec"]

# The compression codecs are only tested when the backing module exists.
try:
    import zlib
except ImportError:
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
2729
2730
class TransformCodecTest(unittest.TestCase):
    """Tests for the non-text "transform" codecs (base64, uu, quopri,
    hex and, when available, zlib/bz2)."""

    def test_basics(self):
        # Every transform codec must round-trip arbitrary bytes and
        # report how many input items were consumed.
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        # StreamReader.read() must decode the transformed data.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        # StreamReader.readline() must work on bytes output as well.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_denylists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                # The denylist check happens before any codec call, so
                # there is no chained cause.
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_denylists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_denylists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_denylists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper exception chains the original as its __cause__.
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2862
2863
# The codec machinery wraps exceptions so that the error message names
# the failing operation and the codec involved.  Wrapping is deliberately
# restricted to relatively stateless exceptions, where the only
# significant information carried is the type and a single str argument.

# Registry local to this test module, so registering extra search
# functions does not look like an object leak.
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    # Codec search function backed by the local registry above; returns
    # None (codec not found) for unknown names.
    try:
        return _TEST_CODECS[codec_name]
    except KeyError:
        return None
2876
2877
class ExceptionChainingTest(unittest.TestCase):
    """Check which exceptions the codec machinery wraps (adding the
    operation and codec name, chaining the original as __cause__) and
    which it deliberately lets propagate unchanged."""

    def setUp(self):
        # Register a per-test codec resolved through the local registry.
        self.codec_name = 'exception_chaining_test'
        codecs.register(_get_test_codec)
        self.addCleanup(codecs.unregister, _get_test_codec)

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)

    def set_codec(self, encode, decode):
        # Install the given callables as this test's codec implementation.
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        # Assert that the with-block raises exc_type with the wrapped
        # message and that the original exception is chained as __cause__.
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # Verify that obj_to_raise gets wrapped on all four encode/decode
        # entry points (str/bytes methods and the codecs module functions).
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        # Raising the exception class (no instance) is still wrapped.
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        # A plain instance with a single str argument is wrapped.
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        # A slotted subclass (no instance dict) is still wrapped.
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        # A plain subclass is wrapped too.
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # Verify that obj_to_raise propagates unchanged from all four
        # encode/decode entry points.
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        # A custom __init__ signature can't be reconstructed -> no wrap.
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        # Likewise for a custom __new__.
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        # Extra per-instance state would be lost by wrapping -> no wrap.
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        # Only a single *str* argument qualifies for wrapping.
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        # More than one argument disqualifies the exception from wrapping.
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
3026
3027
3028
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Tests for codecs.code_page_encode() and codecs.code_page_decode(),
    the Python wrappers around the Windows code-page conversion APIs.
    """
    # Windows code page number for UTF-8.
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        """Negative code pages raise ValueError; an unknown but
        non-negative code page fails with OSError from the Windows API."""
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        """The code page name (e.g. 'cp932') is embedded in the
        Unicode error message raised on failure."""
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Run (raw, errors, expected) decode cases against code page *cp*.

        expected=None means decoding must raise UnicodeDecodeError;
        otherwise it is the expected decoded text.
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        """Run (text, errors, expected) encode cases against code page *cp*.

        expected=None means encoding must raise UnicodeEncodeError;
        otherwise it is the expected encoded bytes, and the whole input
        must have been consumed.
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        """Round-trip and error-handler behavior for cp932 (Shift JIS)."""
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        """Round-trip and error-handler behavior for cp1252 (Western)."""
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        """Round-trip behavior for the Windows UTF-7 code page (65000)."""
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
            ('\U0010ffff', 'strict',  b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        """Error handlers applied to a truncated lead byte followed by a
        valid multi-byte sequence."""
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_code_page_decode_flags(self):
        # Issue #36312: For some code pages (e.g. UTF-7) flags for
        # MultiByteToWideChar() must be set to 0.
        if support.verbose:
            sys.stdout.write('\n')
        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
                   *range(57002, 57011+1), 65000):
            # On small versions of Windows like Windows IoT
            # not all codepages are present.
            # A missing codepage causes an OSError exception
            # so check for the codepage before decoding
            if is_code_page_present(cp):
                self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3), f'cp{cp}')
            else:
                if support.verbose:
                    print(f"  skipping cp={cp}")
        self.assertEqual(codecs.code_page_decode(42, b'abc'),
                         ('\uf061\uf062\uf063', 3))

    def test_incremental(self):
        """With final=False, a trailing incomplete multi-byte sequence is
        left unconsumed instead of raising."""
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        code_page = 99_999
        name = f'cp{code_page}'
        with mock.patch('_winapi.GetACP', return_value=code_page):
            try:
                codec = codecs.lookup(name)
                self.assertEqual(codec.name, 'mbcs')
            finally:
                codecs.unregister(name)

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        encoded = (b'01234567' * ((size//8)-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), size+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')

    @support.bigmemtest(size=2**31, memuse=6, dry_run=False)
    def test_large_utf8_input(self, size):
        # Test input longer than INT_MAX.
        # Input should contain a decodable multi-byte character
        # surrounding INT_MAX
        encoded = (b'0123456\xed\x84\x80' * (size//8))
        self.assertEqual(len(encoded), size // 8 * 10)
        decoded = codecs.code_page_decode(65001, encoded, 'ignore', True)
        self.assertEqual(decoded[1], len(encoded))
        del encoded
        self.assertEqual(len(decoded[0]), size)
        self.assertEqual(decoded[0][:10], '0123456\ud10001')
        self.assertEqual(decoded[0][-11:], '56\ud1000123456\ud100')
3248
3249
class ASCIITest(unittest.TestCase):
    """Tests for the built-in 'ascii' codec."""

    def test_encode(self):
        """Pure-ASCII text encodes to the identical byte sequence."""
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        """Each encode error handler substitutes non-ASCII characters
        as documented."""
        cases = (
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, want in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=want):
                self.assertEqual(text.encode('ascii', handler), want)

    def test_encode_surrogateescape_error(self):
        """surrogateescape only escapes lone surrogates; other non-ASCII
        characters still raise."""
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be decoded, but not the second
            '\udc80\xff'.encode('ascii', 'surrogateescape')

    def test_decode(self):
        """Pure-ASCII bytes decode to the identical text."""
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        """Each decode error handler substitutes bytes >= 0x80
        as documented."""
        cases = (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        )
        for raw, handler, want in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=want):
                self.assertEqual(raw.decode('ascii', handler), want)
3287
3288
class Latin1Test(unittest.TestCase):
    """Tests for the built-in 'latin1' (ISO 8859-1) codec."""

    def test_encode(self):
        """Code points below U+0100 map to bytes of the same value."""
        cases = (
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        )
        for text, want in cases:
            with self.subTest(data=text, expected=want):
                self.assertEqual(text.encode('latin1'), want)

    def test_encode_errors(self):
        """Each encode error handler substitutes characters above U+00FF
        as documented."""
        cases = (
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        )
        for text, handler, want in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=want):
                self.assertEqual(text.encode('latin1', handler), want)

    def test_encode_surrogateescape_error(self):
        """surrogateescape only escapes lone surrogates; other characters
        above U+00FF still raise."""
        with self.assertRaises(UnicodeEncodeError):
            # the first character can be decoded, but not the second
            '\udc80\u20ac'.encode('latin1', 'surrogateescape')

    def test_decode(self):
        """Every byte decodes to the code point of the same value."""
        cases = (
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        )
        for raw, want in cases:
            with self.subTest(data=raw, expected=want):
                self.assertEqual(raw.decode('latin1'), want)
3324
3325
class StreamRecoderTest(unittest.TestCase):
    """Tests for codecs.StreamRecoder and codecs.EncodedFile."""

    def test_writelines(self):
        """writelines() encodes and concatenates every chunk."""
        sink = io.BytesIO()
        info = codecs.lookup('ascii')
        recoder = codecs.StreamRecoder(
            sink, info.encode, info.decode,
            encodings.ascii.StreamReader, encodings.ascii.StreamWriter)
        recoder.writelines([b'a', b'b'])
        self.assertEqual(sink.getvalue(), b'ab')

    def test_write(self):
        """write() transcodes the input bytes into the sink's encoding."""
        sink = io.BytesIO()
        info = codecs.lookup('latin1')
        # Recode from Latin-1 to utf-8.
        recoder = codecs.StreamRecoder(
            sink, info.encode, info.decode,
            encodings.utf_8.StreamReader, encodings.utf_8.StreamWriter)
        text = 'àñé'
        recoder.write(text.encode('latin1'))
        self.assertEqual(sink.getvalue(), text.encode('utf-8'))

    def test_seeking_read(self):
        """seek(0) rewinds and re-reading yields the same lines."""
        raw = io.BytesIO('line1\nline2\nline3\n'.encode('utf-16-le'))
        ef = codecs.EncodedFile(raw, 'utf-8', 'utf-16-le')

        self.assertEqual(ef.readline(), b'line1\n')
        ef.seek(0)
        for want in (b'line1\n', b'line2\n', b'line3\n', b''):
            self.assertEqual(ef.readline(), want)

    def test_seeking_write(self):
        """Writing after a mid-stream seek overwrites in place."""
        raw = io.BytesIO('123456789\n'.encode('utf-16-le'))
        ef = codecs.EncodedFile(raw, 'utf-8', 'utf-16-le')

        # Test that seek() only resets its internal buffer when offset
        # and whence are zero.
        ef.seek(2)
        ef.write(b'\nabc\n')
        self.assertEqual(ef.readline(), b'789\n')
        ef.seek(0)
        for want in (b'1\n', b'abc\n', b'789\n'):
            self.assertEqual(ef.readline(), want)
3370
3371
@unittest.skipIf(_testinternalcapi is None, 'need _testinternalcapi module')
class LocaleCodecTest(unittest.TestCase):
    """
    Test indirectly _Py_DecodeUTF8Ex() and _Py_EncodeUTF8Ex().
    """
    # Encoding the C helpers operate with; depends on the host environment.
    ENCODING = sys.getfilesystemencoding()
    # Sample texts: ASCII, Latin-1 range, U+00FF, a wide range of code
    # points, and lone surrogates (only encodable via surrogate* handlers).
    STRINGS = ("ascii", "ulatin1:\xa7\xe9",
               "u255:\xff",
               "UCS:\xe9\u20ac\U0010ffff",
               "surrogates:\uDC80\uDCFF")
    # Pre-encoded byte strings always fed to the decoder.
    BYTES_STRINGS = (b"blatin1:\xa7\xe9", b"b255:\xff")
    SURROGATES = "\uDC80\uDCFF"

    def encode(self, text, errors="strict"):
        """Encode *text* through the C-level _Py_EncodeLocaleEx() helper."""
        return _testinternalcapi.EncodeLocaleEx(text, 0, errors)

    def check_encode_strings(self, errors):
        """Check that the C encoder agrees with str.encode() for every
        sample string: same bytes on success, an error (RuntimeError on
        the C side) on failure."""
        for text in self.STRINGS:
            with self.subTest(text=text):
                try:
                    expected = text.encode(self.ENCODING, errors)
                except UnicodeEncodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.encode(text, errors)
                    errmsg = str(cm.exception)
                    self.assertRegex(errmsg, r"encode error: pos=[0-9]+, reason=")
                else:
                    encoded = self.encode(text, errors)
                    self.assertEqual(encoded, expected)

    def test_encode_strict(self):
        self.check_encode_strings("strict")

    def test_encode_surrogateescape(self):
        self.check_encode_strings("surrogateescape")

    def test_encode_surrogatepass(self):
        """Run the encode checks with surrogatepass, skipping when the
        locale encoder does not support that handler."""
        try:
            self.encode('', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} encoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise
        self.check_encode_strings("surrogatepass")

    def test_encode_unsupported_error_handler(self):
        """An error handler the C encoder cannot use raises ValueError."""
        with self.assertRaises(ValueError) as cm:
            self.encode('', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')

    def decode(self, encoded, errors="strict"):
        """Decode *encoded* through the C-level _Py_DecodeLocaleEx() helper."""
        return _testinternalcapi.DecodeLocaleEx(encoded, 0, errors)

    def check_decode_strings(self, errors):
        """Check that the C decoder agrees with bytes.decode() over a set
        of byte strings built from BYTES_STRINGS plus every STRINGS entry
        that can be encoded (including surrogatepass variants on UTF-8)."""
        is_utf8 = (self.ENCODING == "utf-8")
        # UTF-8 can represent lone surrogates via surrogateescape; other
        # encodings only get strictly-encodable samples.
        if is_utf8:
            encode_errors = 'surrogateescape'
        else:
            encode_errors = 'strict'

        strings = list(self.BYTES_STRINGS)
        for text in self.STRINGS:
            try:
                encoded = text.encode(self.ENCODING, encode_errors)
                if encoded not in strings:
                    strings.append(encoded)
            except UnicodeEncodeError:
                encoded = None

            if is_utf8:
                # surrogatepass encodes surrogates differently from
                # surrogateescape; add the variant when it is distinct.
                encoded2 = text.encode(self.ENCODING, 'surrogatepass')
                if encoded2 != encoded:
                    strings.append(encoded2)

        for encoded in strings:
            with self.subTest(encoded=encoded):
                try:
                    expected = encoded.decode(self.ENCODING, errors)
                except UnicodeDecodeError:
                    with self.assertRaises(RuntimeError) as cm:
                        self.decode(encoded, errors)
                    errmsg = str(cm.exception)
                    self.assertTrue(errmsg.startswith("decode error: "), errmsg)
                else:
                    decoded = self.decode(encoded, errors)
                    self.assertEqual(decoded, expected)

    def test_decode_strict(self):
        self.check_decode_strings("strict")

    def test_decode_surrogateescape(self):
        self.check_decode_strings("surrogateescape")

    def test_decode_surrogatepass(self):
        """Run the decode checks with surrogatepass, skipping when the
        locale decoder does not support that handler."""
        try:
            self.decode(b'', 'surrogatepass')
        except ValueError as exc:
            if str(exc) == 'unsupported error handler':
                self.skipTest(f"{self.ENCODING!r} decoder doesn't support "
                              f"surrogatepass error handler")
            else:
                raise

        self.check_decode_strings("surrogatepass")

    def test_decode_unsupported_error_handler(self):
        """An error handler the C decoder cannot use raises ValueError."""
        with self.assertRaises(ValueError) as cm:
            self.decode(b'', 'backslashreplace')
        self.assertEqual(str(cm.exception), 'unsupported error handler')
3484
3485
class Rot13Test(unittest.TestCase):
    """Test the educational ROT-13 codec."""

    def test_encode(self):
        """Stateless one-shot encoding via codecs.encode()."""
        self.assertEqual(codecs.encode("Caesar liked ciphers", 'rot-13'),
                         'Pnrfne yvxrq pvcuref')

    def test_decode(self):
        """Stateless one-shot decoding via codecs.decode()."""
        self.assertEqual(codecs.decode('Rg gh, Oehgr?', 'rot-13'),
                         'Et tu, Brute?')

    def test_incremental_encode(self):
        """The incremental encoder produces the same rotation."""
        enc = codecs.getincrementalencoder('rot-13')()
        self.assertEqual(enc.encode('ABBA nag Cheryl Baker'),
                         'NOON ant Purely Onxre')

    def test_incremental_decode(self):
        """The incremental decoder produces the same rotation."""
        dec = codecs.getincrementaldecoder('rot-13')()
        self.assertEqual(dec.decode('terra Ares envy tha'),
                         'green Nerf rail gun')
3505
3506
class Rot13UtilTest(unittest.TestCase):
    """Test the ROT-13 codec via rot13 function,
    i.e. the user has done something like:
    $ echo "Hello World" | python -m encodings.rot_13
    """
    def test_rot13_func(self):
        """rot13() reads from one stream and writes the rotation to another."""
        src = io.StringIO('Gb or, be abg gb or, gung vf gur dhrfgvba')
        dst = io.StringIO()
        encodings.rot_13.rot13(src, dst)
        dst.seek(0)
        self.assertEqual(
            dst.read(),
            'To be, or not to be, that is the question')
3521
3522
3523class CodecNameNormalizationTest(unittest.TestCase):
3524    """Test codec name normalization"""
3525    def test_codecs_lookup(self):
3526        FOUND = (1, 2, 3, 4)
3527        NOT_FOUND = (None, None, None, None)
3528        def search_function(encoding):
3529            if encoding == "aaa_8":
3530                return FOUND
3531            else:
3532                return NOT_FOUND
3533
3534        codecs.register(search_function)
3535        self.addCleanup(codecs.unregister, search_function)
3536        self.assertEqual(FOUND, codecs.lookup('aaa_8'))
3537        self.assertEqual(FOUND, codecs.lookup('AAA-8'))
3538        self.assertEqual(FOUND, codecs.lookup('AAA---8'))
3539        self.assertEqual(FOUND, codecs.lookup('AAA   8'))
3540        self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8'))
3541        self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8'))
3542        self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8'))
3543        self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8'))
3544        self.assertEqual(NOT_FOUND, codecs.lookup('BBB.8'))
3545        self.assertEqual(NOT_FOUND, codecs.lookup('a\xe9\u20ac-8'))
3546
3547    def test_encodings_normalize_encoding(self):
3548        # encodings.normalize_encoding() ignores non-ASCII characters.
3549        normalize = encodings.normalize_encoding
3550        self.assertEqual(normalize('utf_8'), 'utf_8')
3551        self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
3552        self.assertEqual(normalize('utf   8'), 'utf_8')
3553        # encodings.normalize_encoding() doesn't convert
3554        # characters to lower case.
3555        self.assertEqual(normalize('UTF 8'), 'UTF_8')
3556        self.assertEqual(normalize('utf.8'), 'utf.8')
3557        self.assertEqual(normalize('utf...8'), 'utf...8')
3558
3559
# Allow running this test module directly; unittest.main() discovers and
# runs every TestCase defined above.
if __name__ == "__main__":
    unittest.main()
3562